From 24d43f1b8ec3cca5f4df93d56b46d7938024d23d Mon Sep 17 00:00:00 2001 From: Mac Date: Tue, 3 Feb 2026 23:36:49 +0530 Subject: [PATCH] Fix copyright detection for URLs containing (c) symbol Fixes #4724 URLs containing (c) in their path or query parameters were incorrectly detected as copyright statements. For example: http://example.com/path/(c)/test This fix addresses the issue by: 1. Reordering URL/email patterns to appear before (C) and (c) copyright patterns in the lexer, ensuring URL tokens are matched as URLs first 2. Adding junk copyright patterns to filter out false positives from URL fragments containing (c) The tokenizer splits URLs on = and ; characters, which can cause (c) to appear as a separate token. By prioritizing URL pattern matching and filtering URL-like detections, we prevent these false positives. Tested with the original urls.10K file from the issue - now shows 0 false positives (previously had 2). Signed-off-by: Gyan Ranjan Panda --- src/cluecode/copyrights.py | 84 ++++++++++--------- .../data/copyrights/url_with_c_symbol.txt | 3 + 2 files changed, 49 insertions(+), 38 deletions(-) create mode 100644 tests/cluecode/data/copyrights/url_with_c_symbol.txt diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 6d17467acf..dbeda89708 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -705,6 +705,48 @@ def build_detection_from_node( # as javadoc (r'^@[Cc]opyrights?:?$', 'COPY'), + ############################################################################ + # URLS and emails - moved here to prevent (c) in URLs from being matched as copyright + ############################################################################ + + # email start-at-end: : + (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'), + (r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'), + + # a .sh shell scripts is NOT an email. + (r'^.*\.sh\.?$', 'JUNK'), + # email eventually in parens or brackets with some trailing punct. Note the @ or "at " + (r'^(?:[A-Za-z])*[<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[>\)\.\,]*$', 'EMAIL'), + + # mailto URLs + (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'), + + (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'), + + # URLS such as <(http://fedorahosted.org/lohit)> or () + (r'[<\(]https?:.*[>\)]', 'URL'), + # URLS such as ibm.com without a scheme + (r'\s?[a-z0-9A-Z\-\.\\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'), + # TODO: add more extensions: there are so many TLDs these days! + # URL wrapped in () or <> + (r'[\(<]+\s?[a-z0-9A-Z\-\.\\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'), + (r']?$', 'URL'), + # derived from regex in cluecode.finder + (r'\[\]"]+' + r'|(?:www|ftp)\.[^\s<>\[\]]+' + r')\.?>?', 'URL'), + + (r'^\(??$', 'URL'), + + # URLS with trailing/ such as http://fedorahosted.org/lohit/ + # URLS with leading( such as (http://qbnz.com/highlighter/ + (r'\(?https?:.*/', 'URL'), + + ############################################################################ + # Back to COPYRIGHT patterns + ############################################################################ + (r'^\(C\)\,?$', 'COPY'), (r'^\(c\)\,?$', 'COPY'), @@ -2258,44 +2300,6 @@ def build_detection_from_node( # this was capturing AbCdEf or a bare comma. (r'^([A-Z][a-z0-9]+){1,2}\.?,?$', 'NNP'), - ############################################################################ - # URLS and emails - ############################################################################ - - # email start-at-end: : - (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'), - (r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'), - - # a .sh shell scripts is NOT an email. - (r'^.*\.sh\.?$', 'JUNK'), - # email eventually in parens or brackets with some trailing punct. Note the @ or "at " - (r'^(?:[A-Za-z])*[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[\>\)\.\,]*$', 'EMAIL'), - - # mailto URLs - (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'), - - (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'), - - # URLS such as <(http://fedorahosted.org/lohit)> or () - (r'[<\(]https?:.*[>\)]', 'URL'), - # URLS such as ibm.com without a scheme - (r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'), - # TODO: add more extensions: there are so many TLDs these days! - # URL wrapped in () or <> - (r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'), - (r']?$', 'URL'), - # derived from regex in cluecode.finder - (r'\[\]"]+' - r'|(?:www|ftp)\.[^\s<>\[\]"]+' - r')\.?>?', 'URL'), - - (r'^\(??$', 'URL'), - - # URLS with trailing/ such as http://fedorahosted.org/lohit/ - # URLS with leading( such as (http://qbnz.com/highlighter/ - (r'\(?https?:.*/', 'URL'), - ############################################################################ # Misc ############################################################################ @@ -3783,6 +3787,10 @@ def refine_names(s, prefixes): r'^\(c\) \(c\) B$', r'^\(c\) group$', r'^\(c\) \(c\) A$', + # URLs with (c) in path or query - these are false positives + r'.*https?://', # contains http:// or https:// + r'.*/.*\(c\)', # has path-like structure with (c) + r'\(c\).*https?://', # (c) followed by URL ] # a collection of junk junk matcher callables diff --git a/tests/cluecode/data/copyrights/url_with_c_symbol.txt b/tests/cluecode/data/copyrights/url_with_c_symbol.txt new file mode 100644 index 0000000000..e77e91a470 --- /dev/null +++ b/tests/cluecode/data/copyrights/url_with_c_symbol.txt @@ -0,0 +1,3 @@ +http://biblio.cesga.es:81/search*gag/aXove,+Xosé/axove+xose/7,-1,0,B/frameset&F=axuntanza&1,,3 +http://example.com/path/(c)/test +http://test.org/query?param=(c)&other=value