Implementing extractStructuralTokens as a helper for the detection engine
Some checks are pending
/ build (macos-latest, 3.8) (push) Waiting to run
/ build (ubuntu-latest, pypy-2.7) (push) Waiting to run
/ build (windows-latest, 3.14) (push) Waiting to run

This commit is contained in:
Miroslav Štampar 2026-06-30 23:09:06 +02:00
parent f932a3f30f
commit 74f90df8ae
6 changed files with 87 additions and 7 deletions

View file

@ -162,13 +162,13 @@ df768bcb9838dc6c46dab9b4a877056cb4742bd6cfaaf438c4a3712c5cc0d264 extra/shutils/
1966ca704961fb987ab757f0a4afddbf841d1a880631b701487c75cef63d60c3 extra/vulnserver/__init__.py
617cec1b731e0baacafa6f58c2f56a85b6128d1416627cc1b2f61519c8539a2e extra/vulnserver/vulnserver.py
a2bf70d7f87c3a4e0675c0bad54119a4e04efa6ea2730a8338d5aebcd995630e lib/controller/action.py
9137a8f7368496c84b21944f6b94c28004d3a2a849ac9c8e0b20e294e4c4a93a lib/controller/checks.py
f4fb3839e5accd1b58b34226e4b26f5079d9696e24d335d37d870cd5e62d1e80 lib/controller/checks.py
666935b658074dc9c42153622b75d4ec7bfe56fbe0742de827a5d30a1a0f9d96 lib/controller/controller.py
d69e84f1648cdb907f5d2dd454f03874a4613752b07867510145d51d84b3c56f lib/controller/handler.py
1966ca704961fb987ab757f0a4afddbf841d1a880631b701487c75cef63d60c3 lib/controller/__init__.py
9c5764c92ce536d1f0f96200359ee5ef1f37f9128769bf990cb77f1d1f8e17b1 lib/core/agent.py
c51c33501cc905586a9aaac93b06f2ac6f71628d032a7dc39fd0ef05d7ee3856 lib/core/bigarray.py
122767794156afa41b19baa706ad4c124eef6eaf73ed8fd208d8f634e97e82eb lib/core/common.py
d143df718fbaacb617b6046c73cf4e47932e1a25928a4e1ecb87ea77a3b154ed lib/core/common.py
8f1272487e1adfcc8c755a2f56f0c6d21eac5e685a73a9a159482f9dc9142bc5 lib/core/compat.py
a683d0ad9ba543587382c4903d28db610ae20394fcf9045a68b2ab54a39381ae lib/core/convert.py
c03dc585f89642cfd81b087ac2723e3e1bb3bfa8c60e6f5fe58ef3b0113ebfe6 lib/core/data.py
@ -182,14 +182,14 @@ f8de57606325456928e46ae2896f5f8bbec9ad18b1c644b492a566fa992216f6 lib/core/decor
1966ca704961fb987ab757f0a4afddbf841d1a880631b701487c75cef63d60c3 lib/core/__init__.py
914a13ee21fd610a6153a37cbe50830fcbd1324c7ebc1e7fc206d5e598b0f7ad lib/core/log.py
5a576f802f1298d0aa357e766ae6502fa53cacbbe0b1d328b7410a8b20a885b2 lib/core/optiondict.py
e033b20a0f7821797a10f4bf4235723f38c7db551c611fbb713faa621b123c4a lib/core/option.py
98d3d61278794705c7039e40fab66a626e8d6ab765383c5379cec7a066b09301 lib/core/option.py
21b2b1745107c211fc7593923a3da7a808d40763c00091c28de5f7c129bcf3bc lib/core/patch.py
49c0fa7e3814dfda610d665ee02b12df299b28bc0b6773815b4395514ddf8dec lib/core/profiling.py
0c36a65b6237732eb001d333f80f0c58c088ff01ae80cf07e4dcc6da2a806364 lib/core/readlineng.py
9bf174058f15d14e24e94f9aaf42df045119d3617c6c54bd2f3af79b462f331d lib/core/replication.py
0b8c38a01bb01f843d94a6c5f2075ee47520d0c4aa799cecea9c3e2c5a4a23a6 lib/core/revision.py
888daba83fd4a34e9503fe21f01fef4cc730e5cde871b1d40e15d4cbc847d56c lib/core/session.py
098e5d86a0da05d4be5f5ed5371083954be2369abce57fda4bd906d12e1f8870 lib/core/settings.py
a2fb281b59c4526613f22fc0e994b68db91c1263db415aa86002ec4e20773639 lib/core/settings.py
c7804223319e18eb0b8e2cbf0a8b6896d1cefb7b0b1a2e9f1cf826a8a3b56750 lib/core/shell.py
a2e98a94b231432736d6b304fc75525c8b5fdb4768c418387c5b4c1a610dad64 lib/core/subprocessng.py
19f1e3c5e3ba703d28d510cd7a9ab8284d5fbe9df5ce7e77c86e5931571364b7 lib/core/target.py
@ -211,7 +211,7 @@ c2f34e27578742e729c2fa9c1d4f0a0d8f8f7f4cf0fc14c62ec817a260c71dec lib/parse/site
1be3da334411657461421b8a26a0f2ff28e1af1e28f1e963c6c92768f9b0847c lib/request/basicauthhandler.py
369484a2999d29f49bf839a329d1686ed94f6ea27c695e027fe08c8da51f30a3 lib/request/basic.py
bc61bc944b81a7670884f82231033a6ac703324b34b071c9834886a92e249d0e lib/request/chunkedhandler.py
d4bb0869b03602a0c8f9e0e0fd217753f14ddadf848fc9f3c65a74d03feb9958 lib/request/comparison.py
9c0dccc1cee66d38478aaf75a7c513d0d136d50a90b15fed146faa1653899fe1 lib/request/comparison.py
729e07a2ca6b1d83563e9c6dc5a884d1b664c1764be06776ea93bde305164f0c lib/request/connect.py
8e06682280fce062eef6174351bfebcb6040e19976acff9dc7b3699779783498 lib/request/direct.py
a6b37b436838caeb197fea858d0a39fadbff4736256e741b5fcec1f28fcf1ce0 lib/request/dns.py

View file

@ -16,6 +16,7 @@ from extra.beep.beep import beep
from lib.core.agent import agent
from lib.core.common import Backend
from lib.core.common import extractRegexResult
from lib.core.common import extractStructuralTokens
from lib.core.common import extractTextTagContent
from lib.core.common import filterNone
from lib.core.common import findDynamicContent
@ -1390,7 +1391,26 @@ def checkStability():
raise SqlmapNoneDataException(errMsg)
else:
checkDynamicContent(firstPage, secondPage)
# Before engaging the (lossy) dynamic-content removal / '--text-only' escalation, check
# whether the page is structurally stable (identical tag/class/id skeleton across the two
# requests) despite differing text. If so, base the comparison on that value-free structure
# so that dynamic content (e.g. per-render result rows) does not mask an injection. This is
# the HTML counterpart of the structure-aware JSON comparison
if firstPage and secondPage and extractStructuralTokens(firstPage) == extractStructuralTokens(secondPage):
kb.pageStructurallyStable = True
if kb.nullConnection:
debugMsg = "turning off NULL connection "
debugMsg += "support because of structural page comparison"
logger.debug(debugMsg)
kb.nullConnection = None
infoMsg = "target URL content is not byte-stable but structurally stable; sqlmap "
infoMsg += "will base the page comparison on the page structure"
logger.info(infoMsg)
else:
checkDynamicContent(firstPage, secondPage)
return kb.pageStable

View file

@ -176,6 +176,9 @@ from lib.core.settings import REPLACEMENT_MARKER
from lib.core.settings import SENSITIVE_DATA_REGEX
from lib.core.settings import SENSITIVE_OPTIONS
from lib.core.settings import STDIN_PIPE_DASH
from lib.core.settings import STRUCTURAL_CLASS_REGEX
from lib.core.settings import STRUCTURAL_ID_REGEX
from lib.core.settings import STRUCTURAL_TAG_REGEX
from lib.core.settings import SUPPORTED_DBMS
from lib.core.settings import TEXT_TAG_REGEX
from lib.core.settings import TIME_STDEV_COEFF
@ -3227,6 +3230,45 @@ def extractTextTagContent(page):
return filterNone(_.group("result").strip() for _ in re.finditer(TEXT_TAG_REGEX, page))
def extractStructuralTokens(page):
    """
    Builds the value-free structural skeleton of a (HTML) page as a set of tokens, ignoring all
    textual content. Three kinds of tokens are produced for every matched tag:

        tag:<name>            lower-cased tag name
        cls:<name>.<class>    one token per whitespace-separated class attribute value
        id:<name>#<id>        stripped (non-empty) id attribute value

    Tags carrying the reflected value marker are dropped up front, as are script blocks, style
    blocks and HTML comments. Empty (or None) page results in an empty set. Used for the
    structure-aware page comparison (HTML counterpart of jsonMinimize()), where dynamic text
    should not perturb the result while a change in layout still should

    >>> sorted(extractStructuralTokens(u'<div id="g" class="a b"><span>x</span></div>')) == [u'cls:div.a', u'cls:div.b', u'id:div#g', u'tag:div', u'tag:span']
    True
    >>> extractStructuralTokens(u'<table><tr><td>1</td></tr></table>') == set([u'tag:table', u'tag:tr', u'tag:td'])
    True
    >>> extractStructuralTokens(u'') == set()
    True
    """

    content = page if page else ""

    # Tags containing a reflected payload are not part of the original structure
    if REFLECTED_VALUE_MARKER in content:
        content = re.sub(r"(?i)<[^>]*%s[^>]*>" % REFLECTED_VALUE_MARKER, " ", content)

    # Non-structural blocks (scripts, comments, styles) are blanked out before tag matching
    content = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>", " ", content)

    tokens = set()

    for element in re.finditer(STRUCTURAL_TAG_REGEX, content):
        name = element.group(1).lower()
        attributes = element.group(2) or ""

        tokens.add("tag:%s" % name)

        # Attribute value is captured by one of three alternatives (double-quoted, single-quoted, bare)
        for hit in re.finditer(STRUCTURAL_CLASS_REGEX, attributes):
            classes = hit.group(1) or hit.group(2) or hit.group(3) or ""
            tokens.update("cls:%s.%s" % (name, entry) for entry in classes.split())

        for hit in re.finditer(STRUCTURAL_ID_REGEX, attributes):
            identifier = (hit.group(1) or hit.group(2) or hit.group(3) or "").strip()
            if identifier:
                tokens.add("id:%s#%s" % (name, identifier))

    return tokens
def trimAlphaNum(value):
"""
Trims alpha numeric characters from start and ending of a given value

View file

@ -2210,6 +2210,7 @@ def _setKnowledgeBaseAttributes(flushAll=True):
kb.pageTemplates = dict()
kb.pageEncoding = DEFAULT_PAGE_ENCODING
kb.pageStable = None
kb.pageStructurallyStable = None
kb.partRun = None
kb.permissionFlag = False
kb.place = None

View file

@ -20,7 +20,7 @@ from lib.core.enums import OS
from thirdparty import six
# sqlmap version (<major>.<minor>.<month>.<monthly commit>)
VERSION = "1.10.6.198"
VERSION = "1.10.6.199"
TYPE = "dev" if VERSION.count('.') > 2 and VERSION.split('.')[-1] != '0' else "stable"
TYPE_COLORS = {"dev": 33, "stable": 90, "pip": 34}
VERSION_STRING = "sqlmap/%s#%s" % ('.'.join(VERSION.split('.')[:-1]) if VERSION.count('.') > 2 and VERSION.split('.')[-1] == '0' else VERSION, TYPE)
@ -180,6 +180,13 @@ DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100
# Regular expression used for extracting content from "textual" tags
TEXT_TAG_REGEX = r"(?si)<(abbr|acronym|b|blockquote|br|center|cite|code|dt|em|font|h[1-6]|i|li|p|pre|q|strong|sub|sup|td|th|title|tt|u)(?!\w).*?>(?P<result>[^<]+)"
# Regular expressions used for extracting a value-free structural skeleton of a (HTML) page (tag
# names and class/id attribute hooks), for structure-aware comparison of pages whose textual
# content is dynamic but whose layout is stable
STRUCTURAL_TAG_REGEX = r"(?si)<\s*([a-z][a-z0-9]*)((?:\s+[^<>]*)?)/?>"
STRUCTURAL_CLASS_REGEX = r"""(?si)\bclass\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
STRUCTURAL_ID_REGEX = r"""(?si)\bid\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
# Regular expression used for recognition of IP addresses
IP_ADDRESS_REGEX = r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b"

View file

@ -10,6 +10,7 @@ from __future__ import division
import re
from lib.core.common import extractRegexResult
from lib.core.common import extractStructuralTokens
from lib.core.common import getFilteredPageContent
from lib.core.common import jsonMinimize
from lib.core.common import listToStrValue
@ -177,6 +178,15 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
seq1 = jsonMinimize(kb.pageTemplate)
seq2 = jsonMinimize(rawPage)
# Structure-aware comparison for a structurally-stable (but byte-unstable) HTML page:
# compare the value-free tag/class/id skeleton so dynamic text does not perturb the ratio
# while a structural change (e.g. a results table appearing/disappearing) still does
if seq1 is None and kb.pageStructurallyStable and not (conf.titles or conf.textOnly or kb.nullConnection):
_ = "\n".join(sorted(extractStructuralTokens(kb.pageTemplate)))
if _: # only engage when the page actually exposes structure (HTML tags); tagless content falls back to text
seq1 = _
seq2 = "\n".join(sorted(extractStructuralTokens(rawPage)))
if seq1 is None or seq2 is None:
if conf.titles:
seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a)