mirror of
https://github.com/sqlmapproject/sqlmap.git
synced 2026-07-01 22:11:07 +00:00
Implementing extractStructuralTokens as a helper to detection engine
This commit is contained in:
parent
f932a3f30f
commit
74f90df8ae
6 changed files with 87 additions and 7 deletions
|
|
@ -176,6 +176,9 @@ from lib.core.settings import REPLACEMENT_MARKER
|
|||
from lib.core.settings import SENSITIVE_DATA_REGEX
|
||||
from lib.core.settings import SENSITIVE_OPTIONS
|
||||
from lib.core.settings import STDIN_PIPE_DASH
|
||||
from lib.core.settings import STRUCTURAL_CLASS_REGEX
|
||||
from lib.core.settings import STRUCTURAL_ID_REGEX
|
||||
from lib.core.settings import STRUCTURAL_TAG_REGEX
|
||||
from lib.core.settings import SUPPORTED_DBMS
|
||||
from lib.core.settings import TEXT_TAG_REGEX
|
||||
from lib.core.settings import TIME_STDEV_COEFF
|
||||
|
|
@ -3227,6 +3230,45 @@ def extractTextTagContent(page):
|
|||
|
||||
return filterNone(_.group("result").strip() for _ in re.finditer(TEXT_TAG_REGEX, page))
|
||||
|
||||
def extractStructuralTokens(page):
    """
    Returns a set of value-free structural tokens (tag names and class/id attribute hooks) of a
    (HTML) page, discarding all textual content. Used for structure-aware page comparison when the
    page is byte-unstable but structurally stable (e.g. dynamic result rows in a fixed layout), so
    that dynamic text does not perturb the comparison while a structural change (e.g. a results
    table appearing or disappearing) still does. HTML counterpart of jsonMinimize()

    Tokens have the form "tag:<name>", "cls:<name>.<class>" and "id:<name>#<id>", where <name> is
    the lower-cased tag name. A None or empty page results with an empty set

    >>> sorted(extractStructuralTokens(u'<div id="g" class="a b"><span>x</span></div>')) == [u'cls:div.a', u'cls:div.b', u'id:div#g', u'tag:div', u'tag:span']
    True
    >>> extractStructuralTokens(u'<table><tr><td>1</td></tr></table>') == set([u'tag:table', u'tag:tr', u'tag:td'])
    True
    >>> extractStructuralTokens(u'<!----><p>x</p><!-- y -->') == set([u'tag:p'])
    True
    >>> extractStructuralTokens(u'') == set()
    True
    """

    page = page or ""

    # Note: any tag carrying the reflected value marker is dropped as a whole (before tokenization)
    if REFLECTED_VALUE_MARKER in page:
        page = re.sub(r"(?i)<[^>]*%s[^>]*>" % REFLECTED_VALUE_MARKER, " ", page)

    # Note: comment alternative uses '.*?' (not '.+?') so that an empty comment ('<!---->') is closed
    # at its own '-->' instead of swallowing all the markup up to the next '-->' found in the page
    page = re.sub(r"(?si)<script.+?</script>|<!--.*?-->|<style.+?</style>", " ", page)

    retVal = set()

    for match in re.finditer(STRUCTURAL_TAG_REGEX, page):
        tag = match.group(1).lower()
        attrs = match.group(2) or ""

        retVal.add("tag:%s" % tag)

        # Note: at most one of the three groups is set (double-quoted, single-quoted or bare value)
        for attr in re.finditer(STRUCTURAL_CLASS_REGEX, attrs):
            # a class attribute can hold multiple whitespace separated names (one token for each)
            for value in (attr.group(1) or attr.group(2) or attr.group(3) or "").split():
                retVal.add("cls:%s.%s" % (tag, value))

        for attr in re.finditer(STRUCTURAL_ID_REGEX, attrs):
            value = (attr.group(1) or attr.group(2) or attr.group(3) or "").strip()

            # empty (e.g. id="") values carry no structural information
            if value:
                retVal.add("id:%s#%s" % (tag, value))

    return retVal
|
||||
|
||||
def trimAlphaNum(value):
|
||||
"""
|
||||
Trims alpha numeric characters from start and ending of a given value
|
||||
|
|
|
|||
|
|
@ -2210,6 +2210,7 @@ def _setKnowledgeBaseAttributes(flushAll=True):
|
|||
kb.pageTemplates = dict()
|
||||
kb.pageEncoding = DEFAULT_PAGE_ENCODING
|
||||
kb.pageStable = None
|
||||
kb.pageStructurallyStable = None
|
||||
kb.partRun = None
|
||||
kb.permissionFlag = False
|
||||
kb.place = None
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ from lib.core.enums import OS
|
|||
from thirdparty import six
|
||||
|
||||
# sqlmap version (<major>.<minor>.<month>.<monthly commit>)
|
||||
VERSION = "1.10.6.198"
|
||||
VERSION = "1.10.6.199"
|
||||
TYPE = "dev" if VERSION.count('.') > 2 and VERSION.split('.')[-1] != '0' else "stable"
|
||||
TYPE_COLORS = {"dev": 33, "stable": 90, "pip": 34}
|
||||
VERSION_STRING = "sqlmap/%s#%s" % ('.'.join(VERSION.split('.')[:-1]) if VERSION.count('.') > 2 and VERSION.split('.')[-1] == '0' else VERSION, TYPE)
|
||||
|
|
@ -180,6 +180,13 @@ DUMMY_SEARCH_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:141.0) Gecko/20100
|
|||
# Regular expression used for extracting content from "textual" tags
|
||||
TEXT_TAG_REGEX = r"(?si)<(abbr|acronym|b|blockquote|br|center|cite|code|dt|em|font|h[1-6]|i|li|p|pre|q|strong|sub|sup|td|th|title|tt|u)(?!\w).*?>(?P<result>[^<]+)"
|
||||
|
||||
# Regular expressions used for extracting a value-free structural skeleton of a (HTML) page (tag
# names and class/id attribute hooks), for structure-aware comparison of pages whose textual
# content is dynamic but whose layout is stable
# Note: class/id attribute names are anchored with (?<![\w-]) (instead of \b) so that hyphenated
# look-alike attributes (e.g. data-id="42", data-class="x") are not mistaken for id/class hooks
STRUCTURAL_TAG_REGEX = r"(?si)<\s*([a-z][a-z0-9]*)((?:\s+[^<>]*)?)/?>"
STRUCTURAL_CLASS_REGEX = r"""(?si)(?<![\w-])class\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
STRUCTURAL_ID_REGEX = r"""(?si)(?<![\w-])id\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
|
||||
|
||||
# Regular expression used for recognition of IP addresses
|
||||
IP_ADDRESS_REGEX = r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b"
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue