diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 6d17467acf..dbeda89708 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -705,6 +705,48 @@ def build_detection_from_node( # as javadoc (r'^@[Cc]opyrights?:?$', 'COPY'), + ############################################################################ + # URLS and emails - moved here to prevent (c) in URLs from being matched as copyright + ############################################################################ + + # email start-at-end: : + (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'), + (r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'), + + # a .sh shell scripts is NOT an email. + (r'^.*\.sh\.?$', 'JUNK'), + # email eventually in parens or brackets with some trailing punct. Note the @ or "at " + (r'^(?:[A-Za-z])*[<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[>\)\.\,]*$', 'EMAIL'), + + # mailto URLs + (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'), + + (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'), + + # URLS such as <(http://fedorahosted.org/lohit)> or () + (r'[<\(]https?:.*[>\)]', 'URL'), + # URLS such as ibm.com without a scheme + (r'\s?[a-z0-9A-Z\-\.\\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'), + # TODO: add more extensions: there are so many TLDs these days! + # URL wrapped in () or <> + (r'[\(<]+\s?[a-z0-9A-Z\-\.\\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'), + (r']?$', 'URL'), + # derived from regex in cluecode.finder + (r'\[\]"]+' + r'|(?:www|ftp)\.[^\s<>\[\]]+' + r')\.?>?', 'URL'), + + (r'^\(??$', 'URL'), + + # URLS with trailing/ such as http://fedorahosted.org/lohit/ + # URLS with leading( such as (http://qbnz.com/highlighter/ + (r'\(?https?:.*/', 'URL'), + + ############################################################################ + # Back to COPYRIGHT patterns + ############################################################################ + (r'^\(C\)\,?$', 'COPY'), (r'^\(c\)\,?$', 'COPY'), @@ -2258,44 +2300,6 @@ def build_detection_from_node( # this was capturing AbCdEf or a bare comma. (r'^([A-Z][a-z0-9]+){1,2}\.?,?$', 'NNP'), - ############################################################################ - # URLS and emails - ############################################################################ - - # email start-at-end: : - (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'), - (r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'), - - # a .sh shell scripts is NOT an email. - (r'^.*\.sh\.?$', 'JUNK'), - # email eventually in parens or brackets with some trailing punct. Note the @ or "at " - (r'^(?:[A-Za-z])*[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[\>\)\.\,]*$', 'EMAIL'), - - # mailto URLs - (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'), - - (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'), - - # URLS such as <(http://fedorahosted.org/lohit)> or () - (r'[<\(]https?:.*[>\)]', 'URL'), - # URLS such as ibm.com without a scheme - (r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'), - # TODO: add more extensions: there are so many TLDs these days! - # URL wrapped in () or <> - (r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'), - (r']?$', 'URL'), - # derived from regex in cluecode.finder - (r'\[\]"]+' - r'|(?:www|ftp)\.[^\s<>\[\]"]+' - r')\.?>?', 'URL'), - - (r'^\(??$', 'URL'), - - # URLS with trailing/ such as http://fedorahosted.org/lohit/ - # URLS with leading( such as (http://qbnz.com/highlighter/ - (r'\(?https?:.*/', 'URL'), - ############################################################################ # Misc ############################################################################ @@ -3783,6 +3787,10 @@ def refine_names(s, prefixes): r'^\(c\) \(c\) B$', r'^\(c\) group$', r'^\(c\) \(c\) A$', + # URLs with (c) in path or query - these are false positives + r'.*https?://', # contains http:// or https:// + r'.*/.*\(c\)', # has path-like structure with (c) + r'\(c\).*https?://', # (c) followed by URL ] # a collection of junk junk matcher callables diff --git a/tests/cluecode/data/copyrights/url_with_c_symbol.txt b/tests/cluecode/data/copyrights/url_with_c_symbol.txt new file mode 100644 index 0000000000..e77e91a470 --- /dev/null +++ b/tests/cluecode/data/copyrights/url_with_c_symbol.txt @@ -0,0 +1,3 @@ +http://biblio.cesga.es:81/search*gag/aXove,+Xosé/axove+xose/7,-1,0,B/frameset&F=axuntanza&1,,3 +http://example.com/path/(c)/test +http://test.org/query?param=(c)&other=value