From 24d43f1b8ec3cca5f4df93d56b46d7938024d23d Mon Sep 17 00:00:00 2001
From: Mac <mac@Macs-MacBook-Pro.local>
Date: Tue, 3 Feb 2026 23:36:49 +0530
Subject: [PATCH] Fix copyright detection for URLs containing (c) symbol

Fixes #4724

URLs containing (c) in their path or query parameters were incorrectly
detected as copyright statements. For example:
http://example.com/path/(c)/test

This fix addresses the issue by:
1. Reordering URL/email patterns to appear before (C) and (c) copyright
   patterns in the lexer, ensuring URL tokens are matched as URLs first
2. Adding junk copyright patterns to filter out false positives from
   URL fragments containing (c)

The tokenizer splits URLs on = and ; characters, which can cause (c)
to appear as a separate token. By prioritizing URL pattern matching
and filtering URL-like detections, we prevent these false positives.

Tested with the original urls.10K file from the issue - now shows 0
false positives (previously had 2).

Signed-off-by: Gyan Ranjan Panda <gyanranjanpanda@gmail.com>
---
 src/cluecode/copyrights.py                    | 84 ++++++++++---------
 .../data/copyrights/url_with_c_symbol.txt     |  3 +
 2 files changed, 49 insertions(+), 38 deletions(-)
 create mode 100644 tests/cluecode/data/copyrights/url_with_c_symbol.txt
diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
index 6d17467acf..dbeda89708 100644
--- a/src/cluecode/copyrights.py
+++ b/src/cluecode/copyrights.py
@@ -705,6 +705,48 @@ def build_detection_from_node(
     # as javadoc
     (r'^@[Cc]opyrights?:?$', 'COPY'),
 
+    ############################################################################
+    # URLS and emails - placed before the (C)/(c) COPY patterns so URL tokens are tagged as URLs first
+    ############################################################################
+
+     # email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
+    (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
+    (r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),
+
+    # a .sh shell scripts is NOT an email.
+    (r'^.*\.sh\.?$', 'JUNK'),
+    # email eventually in parens or brackets with some trailing punct. Note the @ or "at "
+    (r'^(?:[A-Za-z])*[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[\>\)\.\,]*$', 'EMAIL'),
+
+    # mailto URLs
+    (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'),
+
+    (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'),
+
+    # URLS such as <(http://fedorahosted.org/lohit)> or ()
+    (r'[<\(]https?:.*[>\)]', 'URL'),
+    # URLS such as ibm.com without a scheme
+    (r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'),
+    # TODO: add more extensions: there are so many TLDs these days!
+    # URL wrapped in () or <>
+    (r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'),
+    (r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'),
+    # derived from regex in cluecode.finder
+    (r'<?a?.(href)?.('
+     r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
+     r'|(?:www|ftp)\.[^\s<>\[\]"]+'
+     r')\.?>?', 'URL'),
+
+    (r'^\(?<?https?://[a-zA-Z0-9_\-]+(\.([a-zA-Z0-9_\-])+)+.?\)?>?$', 'URL'),
+
+    # URLS with trailing/ such as http://fedorahosted.org/lohit/
+    # URLS with leading( such as (http://qbnz.com/highlighter/
+    (r'\(?https?:.*/', 'URL'),
+
+    ############################################################################
+    # Back to COPYRIGHT patterns
+    ############################################################################
+
     (r'^\(C\)\,?$', 'COPY'),
     (r'^\(c\)\,?$', 'COPY'),
 
@@ -2258,44 +2300,6 @@ def build_detection_from_node(
     # this was capturing AbCdEf or a bare comma.
     (r'^([A-Z][a-z0-9]+){1,2}\.?,?$', 'NNP'),
 
-    ############################################################################
-    # URLS and emails
-    ############################################################################
-
-     # email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
-    (r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
-    (r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),
-
-    # a .sh shell scripts is NOT an email.
-    (r'^.*\.sh\.?$', 'JUNK'),
-    # email eventually in parens or brackets with some trailing punct. Note the @ or "at "
-    (r'^(?:[A-Za-z])*[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,3}[\>\)\.\,]*$', 'EMAIL'),
-
-    # mailto URLs
-    (r'^mailto:.{2,}@.{2,}\.[a-z]{2,3}', 'EMAIL'),
-
-    (r'^<[a-zA-Z]+[a-zA-Z0-9\.]+@[a-zA-Z][a-zA-Z0-9]+\.[a-zA-Z]{2,5}>$', 'EMAIL'),
-
-    # URLS such as <(http://fedorahosted.org/lohit)> or ()
-    (r'[<\(]https?:.*[>\)]', 'URL'),
-    # URLS such as ibm.com without a scheme
-    (r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'),
-    # TODO: add more extensions: there are so many TLDs these days!
-    # URL wrapped in () or <>
-    (r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'),
-    (r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'),
-    # derived from regex in cluecode.finder
-    (r'<?a?.(href)?.('
-     r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
-     r'|(?:www|ftp)\.[^\s<>\[\]"]+'
-     r')\.?>?', 'URL'),
-
-    (r'^\(?<?https?://[a-zA-Z0-9_\-]+(\.([a-zA-Z0-9_\-])+)+.?\)?>?$', 'URL'),
-
-    # URLS with trailing/ such as http://fedorahosted.org/lohit/
-    # URLS with leading( such as (http://qbnz.com/highlighter/
-    (r'\(?https?:.*/', 'URL'),
-
     ############################################################################
     # Misc
     ############################################################################
@@ -3783,6 +3787,10 @@ def refine_names(s, prefixes):
     r'^\(c\) \(c\) B$',
     r'^\(c\) group$',
     r'^\(c\) \(c\) A$',
+    # URL-shaped detections with (c) are false positives; anchored so statements that merely cite a URL are kept
+    r'^\(?<?https?://',  # detection starts with a URL instead of a copyright marker
+    r'^\S*/\S*\(c\)',  # leading path-like token with (c) embedded in it
+    r'^\(c\)\S*https?://\S*$',  # (c) glued to a URL with no holder text
 ]
 
 # a collection of junk junk matcher callables
diff --git a/tests/cluecode/data/copyrights/url_with_c_symbol.txt b/tests/cluecode/data/copyrights/url_with_c_symbol.txt
new file mode 100644
index 0000000000..e77e91a470
--- /dev/null
+++ b/tests/cluecode/data/copyrights/url_with_c_symbol.txt
@@ -0,0 +1,3 @@
+http://biblio.cesga.es:81/search*gag/aXove,+Xosé/axove+xose/7,-1,0,B/frameset&F=axuntanza&1,,3
+http://example.com/path/(c)/test
+http://test.org/query?param=(c)&other=value