aboutcode-org · pombredanne · May 5, 2025
diff --git a/src/licensedcode/match_spdx_lid.py b/src/licensedcode/match_spdx_lid.py
@@ -392,33 +392,71 @@ def clean_text(text):
 
 
 _split_spdx_lid = re.compile(
-    '(spd[xz][\\-\\s]+lin?[cs]en?[sc]es?[\\-\\s]+identifi?er\\s*:?\\s*)',
-    re.IGNORECASE).split
-
-_nuget_split_spdx_lid = re.compile(
-    '(licenses(?:\\.|\\s)+nuget(?:\\.|\\s)+org\\s*:?\\s*)',
-    re.IGNORECASE).split
+    r'('
+    r'(?:'
+    r'spd[xz][_\-\s]+'
+    r'(?:lin?[cs]en?[sc]es?|short)[_\-\s]+'
+    r'identifi?ers?\s*:?'
+    r'|'
+    r'licenses[\.\s]+nuget[\.\s]+org\s*/?'
+    r')\s*'
+    r')',
+    re.IGNORECASE,
+).split
 
 
 def split_spdx_lid(text):
     """
-    Split text if it contains an "SPDX license identifier". Return a 2-tuple if if there is an SPDX
+    Split text if it contains an "SPDX license identifier". Return a 2-tuple if there is an SPDX
     license identifier where the first item contains the "SPDX license identifier" text proper and
     the second item contains the remainder of the line (expected to be a license expression).
     Otherwise return a 2-tuple where the first item is None and the second item contains the
     original text.
 
-    Also supports "https://licenses.nuget.org" followed by a license expression.
+    Also supports "https://licenses.nuget.org" followed by a license expression as well as minor
+    variants such as "SPDX short identifier" and common typos.
+
+    Split regex examples::
+
+    >>> _split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
+    ['', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
+    >>> _split_spdx_lid("licenses.nuget.org / MIT")
+    ['', 'licenses.nuget.org / ', 'MIT']
+    >>> _split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
+    ['licenseUrl:https://', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
+    >>> _split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
+    ['', 'SPDX-license-Identifier: ', 'MIT OR Unlicense']
+    >>> _split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
+    ['', 'SPDX-license-Identifer: ', 'MIT OR Unlicense']
+    >>> _split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
+    ['', 'SPDX short Identifer : ', 'MIT OR Unlicense']
+    >>> _split_spdx_lid("For OR Unlicense")
+    ['For OR Unlicense']
+    >>> _split_spdx_lid(" REM DNL SPDX short Identifer : MIT OR Unlicense")
+    [' REM DNL ', 'SPDX short Identifer : ', 'MIT OR Unlicense']
+
+    Split full examples::
+
+    >>> split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
+    ('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
+    >>> split_spdx_lid("licenses.nuget.org / MIT")
+    ('licenses.nuget.org / ', 'MIT')
+    >>> split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
+    ('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
+    >>> split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
+    ('SPDX-license-Identifier: ', 'MIT OR Unlicense')
+    >>> split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
+    ('SPDX-license-Identifer: ', 'MIT OR Unlicense')
+    >>> split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
+    ('SPDX short Identifer : ', 'MIT OR Unlicense')
+    >>> split_spdx_lid("For OR Unlicense")
+    (None, 'For OR Unlicense')
     """
     segments = _split_spdx_lid(text)
-    expression = segments[-1]
-    if len(segments) > 1:
-        return segments[-2], expression
+    if len(segments) == 3:
+        # we matched on split OK with exactly three segments
+        _, prefix, expression = segments
+        return prefix, expression
     else:
-        segments = _nuget_split_spdx_lid(text)
-        expression = segments[-1]
-        if len(segments) > 1:
-            return segments[-2], expression
-        else:
-            return None, text
+        return None, text
 
diff --git a/src/licensedcode/query.py b/src/licensedcode/query.py
@@ -102,7 +102,6 @@ def logger_debug(*args):
 # on a single line (e.g. minified JS or CSS).
 MAX_TOKEN_PER_LINE = 25
 
-
 # Break quary in runs if there are `LINES_THRESHOLD` number of empty
 # or non-legalese/junk lines
 LINES_THRESHOLD = 4
@@ -248,19 +247,23 @@ def __init__(
         # TODO: consider using an intbitset
         self.shorts_and_digits_pos = set()
 
-        # list of the three SPDX-License-Identifier tokens to identify to detect
+        # list of the base SPDX-License-Identifier tokens to identify and detect
         # a line for SPDX id matching.
         # note: this will not match anything if the index is not properly set
         dic_get = idx.dictionary.get
         spdxid = [dic_get(u'spdx'), dic_get(u'license'), dic_get(u'identifier')]
 
+        # "SPDX Short identifier" is also an unfortunate thing in the wild
+        # both with and without dash
+        spdxid2 = [dic_get(u'spdx'), dic_get(u'short'), dic_get(u'identifier')]
+
         # There's also other spdx license identifiers like NuGet license URLs
         # Like: `https://licenses.nuget.org/(LGPL-2.0-only WITH FLTK-exception OR Apache-2.0+)`
         nuget_spdx_id = [dic_get(u'licenses'), dic_get(u'nuget'), dic_get(u'org')]
 
         # None, None None: this is mostly a possible issue in test mode
         self.spdx_lid_token_ids = [
-            x for x in [spdxid, nuget_spdx_id, ] if x != [None, None, None]
+            x for x in [spdxid, nuget_spdx_id, spdxid2] if None not in x
         ]
 
         # list of tuple (original line text, start known pos, end known pos) for
@@ -497,7 +500,7 @@ def tokens_by_line(
                 spdx_start_offset = 2
 
             if spdx_start_offset is not None:
-                    
+
                 # keep the line, start/end known pos for SPDX matching
                 spdx_prefix, spdx_expression = split_spdx_lid(line)
                 spdx_text = ''.join([spdx_prefix or '', spdx_expression])