Implementing extractStructuralTokens as a helper to detection engine

2026-07-01 14:01:09 +00:00 · 2026-06-30 23:09:06 +02:00 · 2026-06-30 23:09:06 +02:00 · 74f90df8ae
commit 74f90df8ae
parent f932a3f30f
6 changed files with 87 additions and 7 deletions
--- a/lib/controller/checks.py
+++ b/lib/controller/checks.py
@@ -16,6 +16,7 @@ from extra.beep.beep import beep
 from lib.core.agent import agent
 from lib.core.common import Backend
 from lib.core.common import extractRegexResult
+from lib.core.common import extractStructuralTokens
 from lib.core.common import extractTextTagContent
 from lib.core.common import filterNone
 from lib.core.common import findDynamicContent
@@ -1390,7 +1391,26 @@ def checkStability():
                raise SqlmapNoneDataException(errMsg)

        else:
-            checkDynamicContent(firstPage, secondPage)
+            # Before engaging the (lossy) dynamic-content removal / '--text-only' escalation, check
+            # whether the page is structurally stable (identical tag/class/id skeleton across the two
+            # requests) despite differing text. If so, base the comparison on that value-free structure
+            # so that dynamic content (e.g. per-render result rows) does not mask an injection. This is
+            # the HTML counterpart of the structure-aware JSON comparison
+            if firstPage and secondPage and extractStructuralTokens(firstPage) == extractStructuralTokens(secondPage):
+                kb.pageStructurallyStable = True
+
+                if kb.nullConnection:
+                    debugMsg = "turning off NULL connection "
+                    debugMsg += "support because of structural page comparison"
+                    logger.debug(debugMsg)
+
+                    kb.nullConnection = None
+
+                infoMsg = "target URL content is not byte-stable but structurally stable; sqlmap "
+                infoMsg += "will base the page comparison on the page structure"
+                logger.info(infoMsg)
+            else:
+                checkDynamicContent(firstPage, secondPage)

    return kb.pageStable

--- a/lib/core/common.py
+++ b/lib/core/common.py
@@ -176,6 +176,9 @@ from lib.core.settings import REPLACEMENT_MARKER
 from lib.core.settings import SENSITIVE_DATA_REGEX
 from lib.core.settings import SENSITIVE_OPTIONS
 from lib.core.settings import STDIN_PIPE_DASH
+from lib.core.settings import STRUCTURAL_CLASS_REGEX
+from lib.core.settings import STRUCTURAL_ID_REGEX
+from lib.core.settings import STRUCTURAL_TAG_REGEX
 from lib.core.settings import SUPPORTED_DBMS
 from lib.core.settings import TEXT_TAG_REGEX
 from lib.core.settings import TIME_STDEV_COEFF
@@ -3227,6 +3230,45 @@ def extractTextTagContent(page):

    return filterNone(_.group("result").strip() for _ in re.finditer(TEXT_TAG_REGEX, page))

+def extractStructuralTokens(page):
+    """
+    Returns a set of value-free structural tokens (tag names and class/id attribute hooks) of a
+    (HTML) page, discarding all textual content. Used for structure-aware page comparison when the
+    page is byte-unstable but structurally stable (e.g. dynamic result rows in a fixed layout), so
+    that dynamic text does not perturb the comparison while a structural change (e.g. a results
+    table appearing or disappearing) still does. HTML counterpart of jsonMinimize()
+
+    >>> sorted(extractStructuralTokens(u'<div id="g" class="a b"><span>x</span></div>')) == [u'cls:div.a', u'cls:div.b', u'id:div#g', u'tag:div', u'tag:span']
+    True
+    >>> extractStructuralTokens(u'<table><tr><td>1</td></tr></table>') == set([u'tag:table', u'tag:tr', u'tag:td'])
+    True
+    >>> extractStructuralTokens(u'') == set()
+    True
+    """
+
+    page = page or ""
+
+    if REFLECTED_VALUE_MARKER in page:
+        page = re.sub(r"(?i)<[^>]*%s[^>]*>" % REFLECTED_VALUE_MARKER, " ", page)
+
+    page = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>", " ", page)
+
+    retVal = set()
+
+    for match in re.finditer(STRUCTURAL_TAG_REGEX, page):
+        tag = match.group(1).lower()
+        attrs = match.group(2) or ""
+        retVal.add("tag:%s" % tag)
+        for _ in re.finditer(STRUCTURAL_CLASS_REGEX, attrs):
+            for value in (_.group(1) or _.group(2) or _.group(3) or "").split():
+                retVal.add("cls:%s.%s" % (tag, value))
+        for _ in re.finditer(STRUCTURAL_ID_REGEX, attrs):
+            value = (_.group(1) or _.group(2) or _.group(3) or "").strip()
+            if value:
+                retVal.add("id:%s#%s" % (tag, value))
+
+    return retVal
+
 def trimAlphaNum(value):
    """
    Trims alpha numeric characters from start and ending of a given value
--- a/lib/core/option.py
+++ b/lib/core/option.py
@@ -2210,6 +2210,7 @@ def _setKnowledgeBaseAttributes(flushAll=True):
    kb.pageTemplates = dict()
    kb.pageEncoding = DEFAULT_PAGE_ENCODING
    kb.pageStable = None
+    kb.pageStructurallyStable = None
    kb.partRun = None
    kb.permissionFlag = False
    kb.place = None
--- a/lib/core/settings.py
+++ b/lib/core/settings.py
@@ -20,7 +20,7 @@ from lib.core.enums import OS
 from thirdparty import six

 # sqlmap version (<major>.<minor>.<month>.<monthly commit>)
-VERSION = "1.10.6.198"
+VERSION = "1.10.6.199"
 TYPE = "dev" if VERSION.count('.') > 2 and VERSION.split('.')[-1] != '0' else "stable"
 TYPE_COLORS = {"dev": 33, "stable": 90, "pip": 34}
 VERSION_STRING = "sqlmap/%s#%s" % ('.'.join(VERSION.split('.')[:-1]) if VERSION.count('.') > 2 and VERSION.split('.')[-1] == '0' else VERSION, TYPE)
@@ -180,6 +180,13 @@ DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100
 # Regular expression used for extracting content from "textual" tags
 TEXT_TAG_REGEX = r"(?si)<(abbr|acronym|b|blockquote|br|center|cite|code|dt|em|font|h[1-6]|i|li|p|pre|q|strong|sub|sup|td|th|title|tt|u)(?!\w).*?>(?P<result>[^<]+)"

+# Regular expressions used for extracting a value-free structural skeleton of a (HTML) page (tag
+# names and class/id attribute hooks), for structure-aware comparison of pages whose textual
+# content is dynamic but whose layout is stable
+STRUCTURAL_TAG_REGEX = r"(?si)<\s*([a-z][a-z0-9]*)((?:\s+[^<>]*)?)/?>"
+STRUCTURAL_CLASS_REGEX = r"""(?si)\bclass\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
+STRUCTURAL_ID_REGEX = r"""(?si)\bid\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
+
 # Regular expression used for recognition of IP addresses
 IP_ADDRESS_REGEX = r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b"

--- a/lib/request/comparison.py
+++ b/lib/request/comparison.py
@@ -10,6 +10,7 @@ from __future__ import division
 import re

 from lib.core.common import extractRegexResult
+from lib.core.common import extractStructuralTokens
 from lib.core.common import getFilteredPageContent
 from lib.core.common import jsonMinimize
 from lib.core.common import listToStrValue
@@ -177,6 +178,15 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
                seq1 = jsonMinimize(kb.pageTemplate)
                seq2 = jsonMinimize(rawPage)

+            # Structure-aware comparison for a structurally-stable (but byte-unstable) HTML page:
+            # compare the value-free tag/class/id skeleton so dynamic text does not perturb the ratio
+            # while a structural change (e.g. a results table appearing/disappearing) still does
+            if seq1 is None and kb.pageStructurallyStable and not (conf.titles or conf.textOnly or kb.nullConnection):
+                _ = "\n".join(sorted(extractStructuralTokens(kb.pageTemplate)))
+                if _:   # only engage when the page actually exposes structure (HTML tags); tagless content falls back to text
+                    seq1 = _
+                    seq2 = "\n".join(sorted(extractStructuralTokens(rawPage)))
+
            if seq1 is None or seq2 is None:
                if conf.titles:
                    seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a)