@@ -99,10 +99,11 @@ class pattern(PatternMethods):
9999 str_pattern : str
100100 bin_pattern : bytes
101101
def __init__(self, pattern: str, flags: int = 0, description: str = ''):
    """
    Set up a pattern from its regular expression source.

    The expression is kept both as text and as bytes; the bytes variant is
    produced by encoding the text as ASCII, so a non-ASCII expression raises
    `UnicodeEncodeError` here.

    Args:
        pattern: The regular expression source.
        flags: Integer flags stored alongside the expression.
        description: Optional human-readable summary of what the pattern
            matches; defaults to the empty string.
    """
    self.str_pattern = pattern
    # Encode right after storing the text so that a failing encode leaves
    # the instance in the same partial state as before.
    self.bin_pattern = pattern.encode('ascii')
    self.flags, self.description = flags, description
106107
def __bytes__(self):
    """
    Return the bytes form of the pattern as stored in `bin_pattern`.
    """
    encoded = self.bin_pattern
    return encoded
@@ -243,8 +244,27 @@ def __getattr__(self, name):
243244 raise AttributeError
244245
@property
def description(self):
    """
    The description attached to the object wrapped by this member, i.e. the
    `description` attribute of `self.value`.
    """
    wrapped = self.value
    return wrapped.description
249+
@classmethod
def make_table_with_shorts(cls, name: str):
    """
    Render the members of this enumeration as a three-column text table.

    The first row is a header consisting of the upper-cased `name`, the
    word SHORT and the word DESCRIPTION. Every following row lists one
    member: its name, its short form (empty if it has none), and its
    description.

    The short form of a member is the key under which it additionally
    appears in `__members__` (an enum alias). Members whose name ends in
    "array" are instead shown with the bracketed prefix of their name,
    e.g. `intarray` is displayed as `[int]`.

    Args:
        name: Title of the first column; it is upper-cased for display.

    Returns:
        The table as a single string with one row per line.
    """
    # Aliases show up in __members__ under a key that differs from the
    # canonical member name; when a member has several, the last one wins.
    alias = {
        member.name: key
        for key, member in cls.__members__.items()
        if key != member.name
    }
    for member in cls:
        if member.name.endswith('array'):
            alias[member.name] = F'[{member.name[:-5]}]'
    header = name.upper()
    # default=0 keeps an enumeration without members from raising in max();
    # the outer max() keeps the columns aligned under an overlong header.
    longest = max((len(member.name) for member in cls), default=0)
    width = max(longest + 4, len(header))
    rows = [(header, 'SHORT', 'DESCRIPTION')]
    rows.extend(
        (member.name, alias.get(member.name, ''), member.description)
        for member in cls)
    # NOTE(review): single-space column separators and the bare newline
    # joiner are assumed here; confirm against the rendered help output.
    return '\n'.join(
        F'{"":>8}{n:>{width}} {s:<5} {d}' for n, s, d in rows)
262+
@classmethod
def make_table(cls, name: str):
    """
    Render the members of this enumeration as a two-column text table.

    The first row is a header consisting of the upper-cased `name` and the
    word DESCRIPTION; every following row lists one member name together
    with its description.

    Args:
        name: Title of the first column; it is upper-cased for display.

    Returns:
        The table as a single string with one row per line.
    """
    header = name.upper()
    # default=0 keeps an enumeration without members from raising in max();
    # the outer max() keeps the columns aligned under an overlong header.
    longest = max((len(member.name) for member in cls), default=0)
    width = max(longest + 4, len(header))
    rows = [(header, 'DESCRIPTION')]
    rows.extend((member.name, member.description) for member in cls)
    # NOTE(review): the single-space column separator and the bare newline
    # joiner are assumed here; confirm against the rendered help output.
    return '\n'.join(F'{"":>8}{n:>{width}} {d}' for n, d in rows)
248268
249269
250270_TLDS = R'(?i:{possible_tld})(?!(?:{dealbreakers}))' .format (
@@ -500,108 +520,114 @@ def make_hexline_pattern(blocksize: int) -> str:
500520
501521
class checks(_PatternEnum):
    """
    An enumeration of check patterns; every member carries its own
    description text.
    """
    json = pattern(
        _pattern_json,
        description="Data that consists of JSON-like tokens; cannot detect actual JSON data.",
    )
    path_element_nospace = pattern(
        _pattern_pathpart_nospace,
        description="A string that can be a valid file system path component and contains no spaces.",
    )
507527
508528
509529class formats (_PatternEnum ):
510530 """
511531 An enumeration of patterns for certain formats.
512532 """
513- int = pattern (_pattern_integer )
514- "Integer expressions"
515- flt = pattern (_pattern_float )
516- "Floating point number expressions"
517- num = pattern (_pattern_number )
518- "Either an integer or a float"
519- str = pattern (_pattern_string )
520- "C syntax string literal"
521- cmdstr = pattern (_pattern_cmdstr )
522- "Windows command line escaped string literal"
523- ps1str = pattern (_pattern_ps1str , flags = re .DOTALL )
524- "PowerShell escaped string literal"
525- vbastr = pattern (_pattern_vbastr )
526- "VBS/VBA string literal"
527- vbaint = pattern (_pattern_vbaint )
528- "VBS/VBA integer literal"
529- printable = alphabet (R'[\s!-~]' )
530- "Any sequence of printable characters"
531- urlquote = pattern (_pattern_urlenc )
532- "Any sequence of url-encoded characters, default char set"
533- urlhex = pattern (_pattern_urlhex )
534- "A hex-encoded buffer using URL escape sequences"
535- htmlesc = pattern (_pattern_htmlesc )
536- "A sequence of HTML-escape characters"
537- intarray = tokenize (_pattern_integer , sep = R'[;,]' , bound = '' , unique_sep = True )
538- "Sequences of integers, separated by commas or semicolons"
539- strarray = tokenize (_pattern_string , sep = R'[;,]' , bound = '' , unique_sep = True )
540- "Sequences of strings, separated by commas or semicolons"
541- numarray = tokenize (_pattern_number , sep = R'[;,]' , bound = '' , unique_sep = True )
542- "Sequences of numbers, separated by commas or semicolons"
543- hexarray = tokenize (R'[0-9A-Fa-f]{2}' , sep = R'[;,]' , bound = '' , unique_sep = True )
544- "Arrays of hexadecimal strings, separated by commas or semicolons"
545- word = alphabet (R'\\w' )
546- "Sequences of word characters"
547- letters = alphabet (R'[a-zA-Z]' )
548- "Sequences of alphabetic characters"
549- wshenc = pattern (_pattern_wshenc )
550- "Encoded Windows Scripting Host Scripts (JS/VBS)"
551- alnum = alphabet (R'[a-zA-Z0-9]' )
552- "Sequences of alpha-numeric characters"
553- b32 = pattern ('[A-Z2-7]+|[a-z2-7+]' )
554- "Base32 encoded strings"
555- b58 = alphabet (R'(?:[1-9A-HJ-NP-Za-km-z]' )
556- "Base58 encoded strings"
557- b62 = alphabet (R'(?:[0-9A-Za-z]' )
558- "Base62 encoded strings"
559- b64 = alphabet (R'(?:[0-9a-zA-Z\+/]{4})' , suffix = R'(?:(?:[0-9a-zA-Z\+/]{2,3})={0,3})?' , suffix_max = 6 , token_size = 4 )
560- "Base64 encoded strings"
561- b85 = alphabet (R'[-!+*()#-&^-~0-9;-Z]' )
562- "Base85 encoded strings"
563- a85 = alphabet (R'[!-u]' )
564- "Ascii85 encoded strings"
565- z85 = alphabet (R'[-0-9a-zA-Z.:+=^!/*?&<>()\[\]{}@%$#]' )
566- "Z85 encoded strings"
567- b92 = pattern (_pattern_b92 )
568- "Base92 encoded strings"
569- b64u = alphabet (R'[-\w]{4}' , suffix = R'(?:[-\w]{2,3}={0,3})?' , suffix_max = 6 )
570- "Base64 encoded strings using URL-safe alphabet"
571- hex = alphabet (R'[0-9a-fA-F]{2}' , token_size = 2 )
572- "Hexadecimal strings"
573- b16 = alphabet (R'[0-9A-F]{2}' , token_size = 2 )
574- "Uppercase hexadecimal strings"
575- b16s = tokenize (R'[0-9a-fA-F]+' , R'\s*' , bound = '' )
576- "Hexadecimal strings"
577- b64s = alphabet (R'[-\s\w\+/]' , suffix = R'(?:={0,3})?' , suffix_max = 3 )
578- "Base64 encoded strings, separated by whitespace"
579- b85s = alphabet (R'[-!+*()#-&^-~0-9;-Z\s]' )
580- "Base85 encoded string, separated by whitespace"
581- a85s = alphabet (R'[!-u\s]' )
582- "Ascii85 encoded string, separated by whitespace"
583- z85s = alphabet (R'[-\s0-9a-zA-Z.:+=^!/*?&<>()\[\]{}@%$#]' )
584- "Z85 encoded string, separated by whitespace"
585- utf8 = pattern (_pattern_utf8 )
586- "A sequence of bytes that can be decoded as UTF8."
587- hexdump = tokenize (_pattern_hexline , bound = '' , sep = R'\s*\n' )
588- """
589- This pattern matches a typical hexdump output where hexadecimally encoded
590- bytes are followed by a string which contains dots or printable characters
591- from the dump. For example:
592-
593- 46 4F 4F 0A 42 41 52 0A FOO.BAR.
594- F0 0B AA BA F0 0B ......
595- """
596- uuenc = pattern (_pattern_uuencode )
597- "UUEncoded data"
533+ integer = pattern (_pattern_integer ,
534+ description = "any integer literal expression" )
535+ float = pattern (_pattern_float ,
536+ description = "floating point literals" )
537+ number = pattern (_pattern_number ,
538+ description = "either an integer or a float" )
539+ string = pattern (_pattern_string ,
540+ description = "c-syntax string literal" )
541+ cmdstr = pattern (_pattern_cmdstr ,
542+ description = "Windows command line escaped string literal" )
543+ ps1str = pattern (_pattern_ps1str , flags = re .DOTALL ,
544+ description = "PowerShell escaped string literal" )
545+ vbastr = pattern (_pattern_vbastr ,
546+ description = "VBS/VBA string literal" )
547+ vbaint = pattern (_pattern_vbaint ,
548+ description = "VBS/VBA integer literal" )
549+ printable = alphabet (R'[\s!-~]' ,
550+ description = "printable strings (includes whitespace)" )
551+ urlquote = pattern (_pattern_urlenc ,
552+ description = "url-encoded characters, default char set" )
553+ urlhex = pattern (_pattern_urlhex ,
554+ description = "hex-encoded buffer using URL escape sequences" )
555+ htmlesc = pattern (_pattern_htmlesc ,
556+ description = "sequence of HTML-escape characters" )
557+ intarray = tokenize (_pattern_integer , sep = R'[;,]' , bound = '' , unique_sep = True ,
558+ description = "integers separated by commas or semicolons" )
559+ strarray = tokenize (_pattern_string , sep = R'[;,]' , bound = '' , unique_sep = True ,
560+ description = "strings separated by commas or semicolons" )
561+ numarray = tokenize (_pattern_number , sep = R'[;,]' , bound = '' , unique_sep = True ,
562+ description = "numbers separated by commas or semicolons" )
563+ hexarray = tokenize (R'[0-9A-Fa-f]{2}' , sep = R'[;,]' , bound = '' , unique_sep = True ,
564+ description = "hex sequences separated by commas or semicolons" )
565+ letters = alphabet (R'[a-zA-Z]' ,
566+ description = "alphabetic characters" )
567+ wshenc = pattern (_pattern_wshenc ,
568+ description = "encoded Windows Scripting Host Scripts (JS/VBS)" )
569+ alnum = alphabet (R'[a-zA-Z0-9]' ,
570+ description = "alphanumeric characters" )
# Either an all-uppercase or an all-lowercase run of Base32 symbols. The
# quantifier has to follow the lowercase character class: written inside the
# brackets, "+" is a literal plus sign and the branch would match exactly one
# character.
base32 = pattern('[A-Z2-7]+|[a-z2-7]+',
    description="Base32 encoded strings")
573+ base58 = alphabet (R'(?:[1-9A-HJ-NP-Za-km-z]' ,
574+ description = "Base58 encoded strings" )
575+ base62 = alphabet (R'(?:[0-9A-Za-z]' ,
576+ description = "Base62 encoded strings" )
577+ base64 = alphabet (R'(?:[0-9a-zA-Z\+/]{4})' , suffix = R'(?:(?:[0-9a-zA-Z\+/]{2,3})={0,3})?' , suffix_max = 6 , token_size = 4 ,
578+ description = "Base64 encoded strings" )
579+ base85 = alphabet (R'[-!+*()#-&^-~0-9;-Z]' ,
580+ description = "Base85 encoded strings" )
581+ ascii85 = alphabet (R'[!-u]' ,
582+ description = "Ascii85 encoded strings" )
583+ z85 = alphabet (R'[-0-9a-zA-Z.:+=^!/*?&<>()\[\]{}@%$#]' ,
584+ description = "Z85 encoded strings" )
585+ base92 = pattern (_pattern_b92 ,
586+ description = "Base92 encoded strings" )
587+ base64u = alphabet (R'[-\w]{4}' , suffix = R'(?:[-\w]{2,3}={0,3})?' , suffix_max = 6 ,
588+ description = "Base64 encoded strings using URL-safe alphabet" )
589+ hex = alphabet (R'[0-9a-fA-F]{2}' , token_size = 2 ,
590+ description = "hexadecimal strings" )
591+ base16 = alphabet (R'[0-9A-F]{2}' , token_size = 2 ,
592+ description = "uppercase hexadecimal strings" )
593+ base16s = tokenize (R'[0-9a-fA-F]+' , R'\s*' , bound = '' ,
594+ description = "hexadecimal strings" )
595+ base64s = alphabet (R'[-\s\w\+/]' , suffix = R'(?:={0,3})?' , suffix_max = 3 ,
596+ description = "Base64 encoded strings, separated by whitespace" )
597+ base85s = alphabet (R'[-!+*()#-&^-~0-9;-Z\s]' ,
598+ description = "Base85 encoded string, separated by whitespace" )
599+ a85s = alphabet (R'[!-u\s]' ,
600+ description = "Ascii85 encoded string, separated by whitespace" )
601+ z85s = alphabet (R'[-\s0-9a-zA-Z.:+=^!/*?&<>()\[\]{}@%$#]' ,
602+ description = "Z85 encoded string, separated by whitespace" )
603+ utf8 = pattern (_pattern_utf8 ,
604+ description = "sequences of bytes that can be decoded as UTF8" )
605+ hexdump = tokenize (_pattern_hexline , bound = '' , sep = R'\s*\n' ,
606+ description = "typical hexdump output" )
607+ uuenc = pattern (_pattern_uuencode ,
608+ description = "UUEncoded data" )
598609
599610 # shortcuts
600- float = flt
601- integer = int
602- number = num
603- string = str
611+ flt = float
612+ int = integer
613+ num = number
614+ str = string
615+ b32 = base32
616+ b58 = base58
617+ b62 = base62
618+ b64 = base64
619+ b85 = base85
620+ b92 = base92
621+ a85 = ascii85
622+ b16 = base16
623+ b64u = base64u
624+ b16s = base16s
625+ b64s = base64s
626+ b85s = base85s
604627 ps = printable
628+ hd = hexdump
629+ uq = urlquote
630+ uh = urlhex
605631
606632 @classmethod
607633 def from_dashname (cls , key : str ):
@@ -646,48 +672,48 @@ class indicators(_PatternEnum):
646672 """
647673 An enumeration of patterns for indicators.
648674 """
649- domain = pattern (_pattern_serrated_domain )
650- "Domain names"
651- email = pattern (_pattern_email )
652- "Email addresses"
653- guid = pattern (_pattern_guid )
654- "Windows GUID strings"
655- date = pattern (_pattern_date )
656- "A date or timestamp value in a common format"
657- ipv4 = pattern (_pattern_serrated_ipv4 )
658- "String representations of IPv4 addresses"
659- ipv6 = pattern (_pattern_ipv6 )
660- "String representations of IPv6 addresses"
661- md5 = alphabet ( '[0-9A-Fa-f]' , lower = 32 , upper = 32 )
662- "Hexadecimal strings of length 32"
663- sha1 = alphabet ( '[0-9A-Fa-f]' , lower = 40 , upper = 40 )
664- "Hexadecimal strings of length 40"
665- sha256 = alphabet ( '[0-9A-Fa-f]' , lower = 64 , upper = 64 )
666- "Hexadecimal strings of length 64"
667- host = pattern ( _pattern_serrated_host )
668- "Any domain name or IPv4 address, optionally followed by a colon and a port number."
669- socket = pattern ( _pattern_serrated_socket )
670- "Any domain name or IPv4 address followed by a colon and a (port) number"
671- subdomain = pattern ( _pattern_subdomain )
672- "A domain which contains at least three parts, including the top level"
673- url = pattern (_pattern_serrated_url )
674- "Uniform resource locator addresses"
675- pem = pattern (_pattern_pem )
676- "A pattern matching PEM encoded cryptographic parameters"
677- path = pattern (_pattern_any_path )
678- "Windows and Linux path names"
679- winpath = pattern (_pattern_win_path )
680- "Windows path names"
681- nixpath = pattern (_pattern_nix_path )
682- "Posix path names"
683- fpath = pattern (_pattern_any_path_terse )
684- "Terser pattern for Windows and Linux path names"
685- winfpath = pattern (_pattern_win_path_terse )
686- "Terser pattern for Windows path names"
687- nixfpath = pattern (_pattern_nix_path_terse )
688- "Terser pattern for Posix path names"
689- evar = pattern (_pattern_win_env_variable )
690- "Windows environment variables , i.e. something like ` %APPDATA%`"
675+ date = pattern (_pattern_date ,
676+ description = "date or timestamp value in a common format" )
677+ domain = pattern (_pattern_serrated_domain ,
678+ description = "domain names" )
679+ email = pattern (_pattern_email ,
680+ description = "email addresses" )
681+ guid = pattern (_pattern_guid ,
682+ description = "Windows GUID" )
683+ ipv4 = pattern (_pattern_serrated_ipv4 ,
684+ description = " IPv4 address string" )
685+ ipv6 = pattern (_pattern_ipv6 ,
686+ description = " IPv6 address string" )
687+ host = pattern ( _pattern_serrated_host ,
688+ description = "domain or IPv4 optionally followed by colon and port" )
689+ socket = pattern ( _pattern_serrated_socket ,
690+ description = "domain or IPv4 followed by colon and port number" )
691+ url = pattern ( _pattern_serrated_url ,
692+ description = "uniform resource locator addresses" )
693+ md5 = alphabet ( '[0-9A-Fa-f]' , lower = 32 , upper = 32 ,
694+ description = "hex strings of length 32" )
695+ sha1 = alphabet ( '[0-9A-Fa-f]' , lower = 40 , upper = 40 ,
696+ description = "hex strings of length 40" )
697+ sha256 = alphabet ( '[0-9A-Fa-f]' , lower = 64 , upper = 64 ,
698+ description = "hex strings of length 64" )
699+ subdomain = pattern (_pattern_subdomain ,
700+ description = "domain containing at least three parts including TLD" )
701+ pem = pattern (_pattern_pem ,
702+ description = " PEM encoded cryptographic parameters")
703+ path = pattern (_pattern_any_path ,
704+ description = "Windows and Linux file paths" )
705+ winpath = pattern (_pattern_win_path ,
706+ description = "file paths (Windows)" )
707+ nixpath = pattern (_pattern_nix_path ,
708+ description = "file paths (Linux)" )
709+ tpath = pattern (_pattern_any_path_terse ,
710+ description = "terser pattern for file paths" )
711+ wintpath = pattern (_pattern_win_path_terse ,
712+ description = "terser file path pattern (Windows)" )
713+ nixtpath = pattern (_pattern_nix_path_terse ,
714+ description = "terser file path pattern (Linux)" )
715+ evar = pattern (_pattern_win_env_variable ,
716+ description = "Windows environment variable , i.e. %APPDATA%" )
691717
692718 hostname = host
693719
0 commit comments