1 file changed: 13 additions, 2 deletions

@@ -389,12 +389,12 @@ def detect(self,
             yield author
 
 
-def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;:]+').split):
+def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
     """
     Return an iterable of pygmars.Token built from a ``numbered_lines`` iterable
     of tuples of (line number, text).
 
-    We perform a simple tokenization on spaces, tabs and some punctuation: =;:
+    We perform a simple tokenization on spaces, tabs and some punctuation: =;
     """
     last_line = ""
     for start_line, line in numbered_lines:
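
Note on the splitter change above: dropping `:` from the character class means the regex no longer breaks a token such as `Author:Frankie.Chu` at the colon; that case is now handled explicitly in the hunk below. A minimal standalone sketch of the difference (the two regexes come from the patch; the demo line is only an example):

    import re

    # Old and new splitters used by get_tokens(); only the colon differs.
    old_split = re.compile(r'[\t =;:]+').split
    new_split = re.compile(r'[\t =;]+').split

    line = "Author:Frankie.Chu"
    print(old_split(line))  # ['Author', 'Frankie.Chu'] -- the colon used to split here
    print(new_split(line))  # ['Author:Frankie.Chu']    -- token kept whole, handled later
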
@@ -436,6 +436,17 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;:]+').split):
                 .rstrip(':')  # strip trailing colons
                 .strip()
             )
+
+            # Split tokens like 'Author:Frankie.Chu' into 'Author' and 'Frankie.Chu'
+            if tok.startswith("Author:"):
+                parts = tok.split(":", 1)
+                if len(parts) == 2:
+                    for part in parts:
+                        part = part.strip()
+                        if part and part not in ':.':
+                            yield Token(value=part, start_line=start_line, pos=pos)
+                            pos += 1
+                    continue
 
             # the tokenizer allows a single colon or dot to be a token and we discard these
             if tok and tok not in ':.':
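
The added branch splits a combined `Author:name` token into two tokens and then skips the generic path for that token. A simplified, standalone sketch of that behavior, using a namedtuple as a hypothetical stand-in for pygmars.Token (the real class carries more attributes) and a made-up helper name:

    from collections import namedtuple

    # Hypothetical minimal stand-in for pygmars.Token, for illustration only.
    Token = namedtuple("Token", "value start_line pos")

    def split_author_token(tok, start_line, pos):
        # Mirrors the added branch: 'Author:Frankie.Chu' -> 'Author', 'Frankie.Chu'
        if tok.startswith("Author:"):
            parts = tok.split(":", 1)
            if len(parts) == 2:
                for part in parts:
                    part = part.strip()
                    if part and part not in ':.':
                        yield Token(value=part, start_line=start_line, pos=pos)
                        pos += 1

    print(list(split_author_token("Author:Frankie.Chu", start_line=12, pos=0)))
    # [Token(value='Author', start_line=12, pos=0),
    #  Token(value='Frankie.Chu', start_line=12, pos=1)]

In the patch itself, the `continue` after a successful split ensures the combined token is not also yielded by the generic `if tok and tok not in ':.':` path that follows.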