Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 46 additions & 38 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,48 @@ def build_detection_from_node(
# as javadoc
(r'^@[Cc]opyrights?:?$', 'COPY'),

############################################################################
# URLS and emails - moved here to prevent (c) in URLs from being matched as copyright
############################################################################

# email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
(r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
(r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),

# a .sh shell script is NOT an email.
(r'^.*\.sh\.?$', 'JUNK'),
# email, possibly wrapped in parens or angle brackets, with optional trailing punctuation. Note the @ or "at"
(r'^(?:[A-Za-z])*[<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[>\)\.\,]*$', 'EMAIL'),

# mailto URLs
(r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'),

(r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'),

# URLS such as <(http://fedorahosted.org/lohit)> or ()
(r'[<\(]https?:.*[>\)]', 'URL'),
# URLS such as ibm.com without a scheme
(r'\s?[a-z0-9A-Z\-\.\\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'),
# TODO: add more extensions: there are so many TLDs these days!
# URL wrapped in () or <>
(r'[\(<]+\s?[a-z0-9A-Z\-\.\\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'),
(r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'),
# derived from regex in cluecode.finder
(r'<?a?.(href)?('
r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
r'|(?:www|ftp)\.[^\s<>\[\]]+'
r')\.?>?', 'URL'),

(r'^\(?<?https?://[a-zA-Z0-9_\-]+(\\.([a-zA-Z0-9_\-])+)+.?\)?>?$', 'URL'),

# URLS with trailing/ such as http://fedorahosted.org/lohit/
# URLS with leading( such as (http://qbnz.com/highlighter/
(r'\(?https?:.*/', 'URL'),

############################################################################
# Back to COPYRIGHT patterns
############################################################################

(r'^\(C\)\,?$', 'COPY'),
(r'^\(c\)\,?$', 'COPY'),

Expand Down Expand Up @@ -2258,44 +2300,6 @@ def build_detection_from_node(
# this was capturing AbCdEf or a bare comma.
(r'^([A-Z][a-z0-9]+){1,2}\.?,?$', 'NNP'),

############################################################################
# URLS and emails
############################################################################

# email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
(r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
(r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),

# a .sh shell script is NOT an email.
(r'^.*\.sh\.?$', 'JUNK'),
# email, possibly wrapped in parens or angle brackets, with optional trailing punctuation. Note the @ or "at"
(r'^(?:[A-Za-z])*[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[\>\)\.\,]*$', 'EMAIL'),

# mailto URLs
(r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'),

(r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'),

# URLS such as <(http://fedorahosted.org/lohit)> or ()
(r'[<\(]https?:.*[>\)]', 'URL'),
# URLS such as ibm.com without a scheme
(r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'),
# TODO: add more extensions: there are so many TLDs these days!
# URL wrapped in () or <>
(r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'),
(r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'),
# derived from regex in cluecode.finder
(r'<?a?.(href)?.('
r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
r'|(?:www|ftp)\.[^\s<>\[\]"]+'
r')\.?>?', 'URL'),

(r'^\(?<?https?://[a-zA-Z0-9_\-]+(\.([a-zA-Z0-9_\-])+)+.?\)?>?$', 'URL'),

# URLS with trailing/ such as http://fedorahosted.org/lohit/
# URLS with leading( such as (http://qbnz.com/highlighter/
(r'\(?https?:.*/', 'URL'),

############################################################################
# Misc
############################################################################
Expand Down Expand Up @@ -3783,6 +3787,10 @@ def refine_names(s, prefixes):
r'^\(c\) \(c\) B$',
r'^\(c\) group$',
r'^\(c\) \(c\) A$',
# URLs with (c) in path or query - these are false positives
r'.*https?://', # contains http:// or https://
r'.*/.*\(c\)', # has path-like structure with (c)
r'\(c\).*https?://', # (c) followed by URL
]

# a collection of junk matcher callables
Expand Down
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/url_with_c_symbol.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
http://biblio.cesga.es:81/search*gag/aXove,+Xosé/axove+xose/7,-1,0,B/frameset&F=axuntanza&1,,3
http://example.com/path/(c)/test
http://test.org/query?param=(c)&other=value
Loading