Implementing extractStructuralTokens as a helper to detection engine
Some checks are pending
/ build (macos-latest, 3.8) (push) Waiting to run
/ build (ubuntu-latest, pypy-2.7) (push) Waiting to run
/ build (windows-latest, 3.14) (push) Waiting to run

This commit is contained in:
Miroslav Štampar 2026-06-30 23:09:06 +02:00
parent f932a3f30f
commit 74f90df8ae
6 changed files with 87 additions and 7 deletions

View file

@ -16,6 +16,7 @@ from extra.beep.beep import beep
from lib.core.agent import agent
from lib.core.common import Backend
from lib.core.common import extractRegexResult
from lib.core.common import extractStructuralTokens
from lib.core.common import extractTextTagContent
from lib.core.common import filterNone
from lib.core.common import findDynamicContent
@ -1390,7 +1391,26 @@ def checkStability():
raise SqlmapNoneDataException(errMsg)
else:
checkDynamicContent(firstPage, secondPage)
# Before engaging the (lossy) dynamic-content removal / '--text-only' escalation, check
# whether the page is structurally stable (identical tag/class/id skeleton across the two
# requests) despite differing text. If so, base the comparison on that value-free structure
# so that dynamic content (e.g. per-render result rows) does not mask an injection. This is
# the HTML counterpart of the structure-aware JSON comparison
if firstPage and secondPage and extractStructuralTokens(firstPage) == extractStructuralTokens(secondPage):
kb.pageStructurallyStable = True
if kb.nullConnection:
debugMsg = "turning off NULL connection "
debugMsg += "support because of structural page comparison"
logger.debug(debugMsg)
kb.nullConnection = None
infoMsg = "target URL content is not byte-stable but structurally stable; sqlmap "
infoMsg += "will base the page comparison on the page structure"
logger.info(infoMsg)
else:
checkDynamicContent(firstPage, secondPage)
return kb.pageStable

View file

@ -176,6 +176,9 @@ from lib.core.settings import REPLACEMENT_MARKER
from lib.core.settings import SENSITIVE_DATA_REGEX
from lib.core.settings import SENSITIVE_OPTIONS
from lib.core.settings import STDIN_PIPE_DASH
from lib.core.settings import STRUCTURAL_CLASS_REGEX
from lib.core.settings import STRUCTURAL_ID_REGEX
from lib.core.settings import STRUCTURAL_TAG_REGEX
from lib.core.settings import SUPPORTED_DBMS
from lib.core.settings import TEXT_TAG_REGEX
from lib.core.settings import TIME_STDEV_COEFF
@ -3227,6 +3230,45 @@ def extractTextTagContent(page):
return filterNone(_.group("result").strip() for _ in re.finditer(TEXT_TAG_REGEX, page))
def extractStructuralTokens(page):
    """
    Builds the value-free structural skeleton of a (HTML) page as a set of tokens

    Every opening tag matched by STRUCTURAL_TAG_REGEX contributes a "tag:<name>" token (name
    lower-cased), a "cls:<name>.<value>" token for each whitespace separated class value and an
    "id:<name>#<value>" token for each non-empty id value. Textual content is ignored, while tags
    carrying the reflected value marker, <script>/<style> blocks and HTML comments are blanked out
    before the scan. Meant for structure-aware page comparison of content that is byte-unstable
    but structurally stable (HTML counterpart of jsonMinimize()). A false-like page (e.g. None)
    is handled as an empty string and results in an empty set

    >>> sorted(extractStructuralTokens(u'<div id="g" class="a b"><span>x</span></div>')) == [u'cls:div.a', u'cls:div.b', u'id:div#g', u'tag:div', u'tag:span']
    True
    >>> extractStructuralTokens(u'<table><tr><td>1</td></tr></table>') == set([u'tag:table', u'tag:tr', u'tag:td'])
    True
    >>> extractStructuralTokens(u'') == set()
    True
    """

    def _attributeValue(found):
        # The attribute patterns capture the value in one of three groups (double-quoted,
        # single-quoted or unquoted form); the first non-empty one is used
        return found.group(1) or found.group(2) or found.group(3) or ""

    content = page if page else ""

    # Tags containing the reflected value marker are dropped as a whole
    if REFLECTED_VALUE_MARKER in content:
        content = re.sub(r"(?i)<[^>]*%s[^>]*>" % REFLECTED_VALUE_MARKER, " ", content)

    # Script blocks, comments and style blocks are blanked out so that markup inside of them is not scanned
    content = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>", " ", content)

    tokens = set()

    for element in re.finditer(STRUCTURAL_TAG_REGEX, content):
        name = element.group(1).lower()
        attributes = element.group(2) or ""

        tokens.add("tag:%s" % name)

        # One token per individual class value
        for found in re.finditer(STRUCTURAL_CLASS_REGEX, attributes):
            tokens.update("cls:%s.%s" % (name, part) for part in _attributeValue(found).split())

        # Blank id values are ignored
        for found in re.finditer(STRUCTURAL_ID_REGEX, attributes):
            identifier = _attributeValue(found).strip()
            if identifier:
                tokens.add("id:%s#%s" % (name, identifier))

    return tokens
def trimAlphaNum(value):
"""
Trims alpha numeric characters from start and ending of a given value

View file

@ -2210,6 +2210,7 @@ def _setKnowledgeBaseAttributes(flushAll=True):
kb.pageTemplates = dict()
kb.pageEncoding = DEFAULT_PAGE_ENCODING
kb.pageStable = None
kb.pageStructurallyStable = None
kb.partRun = None
kb.permissionFlag = False
kb.place = None

View file

@ -20,7 +20,7 @@ from lib.core.enums import OS
from thirdparty import six
# sqlmap version (<major>.<minor>.<month>.<monthly commit>)
VERSION = "1.10.6.198"
VERSION = "1.10.6.199"
TYPE = "dev" if VERSION.count('.') > 2 and VERSION.split('.')[-1] != '0' else "stable"
TYPE_COLORS = {"dev": 33, "stable": 90, "pip": 34}
VERSION_STRING = "sqlmap/%s#%s" % ('.'.join(VERSION.split('.')[:-1]) if VERSION.count('.') > 2 and VERSION.split('.')[-1] == '0' else VERSION, TYPE)
@ -180,6 +180,13 @@ DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100
# Regular expression used for extracting content from "textual" tags
TEXT_TAG_REGEX = r"(?si)<(abbr|acronym|b|blockquote|br|center|cite|code|dt|em|font|h[1-6]|i|li|p|pre|q|strong|sub|sup|td|th|title|tt|u)(?!\w).*?>(?P<result>[^<]+)"
# Regular expressions used for extracting a value-free structural skeleton of a (HTML) page (tag
# names and class/id attribute hooks), for structure-aware comparison of pages whose textual
# content is dynamic but whose layout is stable
# Note: in STRUCTURAL_TAG_REGEX group 1 is the tag name and group 2 the raw attribute text of the tag
STRUCTURAL_TAG_REGEX = r"(?si)<\s*([a-z][a-z0-9]*)((?:\s+[^<>]*)?)/?>"
# Note: in the class/id expressions groups 1-3 hold the double-quoted, single-quoted or unquoted
# attribute value. The (?<![\w-]) guard (used instead of a plain \b) prevents hyphenated look-alike
# attributes (e.g. data-id="42", ng-class="...") from being taken for id/class, as those commonly
# carry per-render values which would leak into the (supposedly value-free) skeleton
STRUCTURAL_CLASS_REGEX = r"""(?si)(?<![\w-])class\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
STRUCTURAL_ID_REGEX = r"""(?si)(?<![\w-])id\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
# Regular expression used for recognition of IP addresses
IP_ADDRESS_REGEX = r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b"

View file

@ -10,6 +10,7 @@ from __future__ import division
import re
from lib.core.common import extractRegexResult
from lib.core.common import extractStructuralTokens
from lib.core.common import getFilteredPageContent
from lib.core.common import jsonMinimize
from lib.core.common import listToStrValue
@ -177,6 +178,15 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
seq1 = jsonMinimize(kb.pageTemplate)
seq2 = jsonMinimize(rawPage)
# Structure-aware comparison for a structurally-stable (but byte-unstable) HTML page:
# compare the value-free tag/class/id skeleton so dynamic text does not perturb the ratio
# while a structural change (e.g. a results table appearing/disappearing) still does
if seq1 is None and kb.pageStructurallyStable and not (conf.titles or conf.textOnly or kb.nullConnection):
_ = "\n".join(sorted(extractStructuralTokens(kb.pageTemplate)))
if _: # only engage when the page actually exposes structure (HTML tags); tagless content falls back to text
seq1 = _
seq2 = "\n".join(sorted(extractStructuralTokens(rawPage)))
if seq1 is None or seq2 is None:
if conf.titles:
seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a)