Skip to content

Detect "SPDX Short Identifier" tags #4301 #4302

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 55 additions & 17 deletions src/licensedcode/match_spdx_lid.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,33 +392,71 @@ def clean_text(text):


_split_spdx_lid = re.compile(
'(spd[xz][\\-\\s]+lin?[cs]en?[sc]es?[\\-\\s]+identifi?er\\s*:?\\s*)',
re.IGNORECASE).split

_nuget_split_spdx_lid = re.compile(
'(licenses(?:\\.|\\s)+nuget(?:\\.|\\s)+org\\s*:?\\s*)',
re.IGNORECASE).split
r'('
r'(?:'
r'spd[xz][_\-\s]+'
r'(?:lin?[cs]en?[sc]es?|short)[_\-\s]+'
'identifi?ers?\s*:?'
r'|'
r'licenses[\.\s]+nuget[\.\s]+org\s*/?'
r')\s*'
r')',
re.IGNORECASE,
).split


def split_spdx_lid(text):
"""
Split text if it contains an "SPDX license identifier". Return a 2-tuple if if there is an SPDX
Split text if it contains an "SPDX license identifier". Return a 2-tuple if there is an SPDX
license identifier where the first item contains the "SPDX license identifier" text proper and
the second item contains the remainder of the line (expected to be a license expression).
Otherwise return a 2-tuple where the first item is None and the second item contains the
original text.

Also supports "https://licenses.nuget.org" followed by a license expression.
Also supports "https://licenses.nuget.org" followed by a license expression as well as minor
variants such as "SPDX short Identifier", and typos.

Split regex examples::

>>> _split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
['', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
>>> _split_spdx_lid("licenses.nuget.org / MIT")
['', 'licenses.nuget.org / ', 'MIT']
>>> _split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
['licenseUrl:https://', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
>>> _split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
['', 'SPDX-license-Identifier: ', 'MIT OR Unlicense']
>>> _split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
['', 'SPDX-license-Identifer: ', 'MIT OR Unlicense']
>>> _split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
['', 'SPDX short Identifer : ', 'MIT OR Unlicense']
>>> _split_spdx_lid("For OR Unlicense")
['For OR Unlicense']
>>> _split_spdx_lid(" REM DNL SPDX short Identifer : MIT OR Unlicense")
[' REM DNL ', 'SPDX short Identifer : ', 'MIT OR Unlicense']

Split full examples::

>>> split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
>>> split_spdx_lid("licenses.nuget.org / MIT")
('licenses.nuget.org / ', 'MIT')
>>> split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
>>> split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
('SPDX-license-Identifier: ', 'MIT OR Unlicense')
>>> split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
('SPDX-license-Identifer: ', 'MIT OR Unlicense')
>>> split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
('SPDX short Identifer : ', 'MIT OR Unlicense')
>>> split_spdx_lid("For OR Unlicense")
(None, 'For OR Unlicense')
"""
segments = _split_spdx_lid(text)
expression = segments[-1]
if len(segments) > 1:
return segments[-2], expression
if len(segments) == 3:
# we matched on split OK with exactly three segments
_, prefix, expression = segments
return prefix, expression
else:
segments = _nuget_split_spdx_lid(text)
expression = segments[-1]
if len(segments) > 1:
return segments[-2], expression
else:
return None, text
return None, text

11 changes: 7 additions & 4 deletions src/licensedcode/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ def logger_debug(*args):
# on a single line (e.g. minified JS or CSS).
MAX_TOKEN_PER_LINE = 25


# Break query into runs if there are `LINES_THRESHOLD` number of empty
# or non-legalese/junk lines
LINES_THRESHOLD = 4
Expand Down Expand Up @@ -248,19 +247,23 @@ def __init__(
# TODO: consider using an intbitset
self.shorts_and_digits_pos = set()

# list of the three SPDX-License-Identifier tokens to identify to detect
# list of the base SPDX-License-Identifier tokens to identify and detect
# a line for SPDX id matching.
# note: this will not match anything if the index is not properly set
dic_get = idx.dictionary.get
spdxid = [dic_get(u'spdx'), dic_get(u'license'), dic_get(u'identifier')]

# "SPDX Short identifier" is also an unfortunate thing in the wild
# both with and without dash
spdxid2 = [dic_get(u'spdx'), dic_get(u'short'), dic_get(u'identifier')]

# There's also other spdx license identifiers like NuGet license URLs
# Like: `https://licenses.nuget.org/(LGPL-2.0-only WITH FLTK-exception OR Apache-2.0+)`
nuget_spdx_id = [dic_get(u'licenses'), dic_get(u'nuget'), dic_get(u'org')]

# None, None, None: this is mostly a possible issue in test mode
self.spdx_lid_token_ids = [
x for x in [spdxid, nuget_spdx_id, ] if x != [None, None, None]
x for x in [spdxid, nuget_spdx_id, spdxid2] if None not in x
]

# list of tuple (original line text, start known pos, end known pos) for
Expand Down Expand Up @@ -497,7 +500,7 @@ def tokens_by_line(
spdx_start_offset = 2

if spdx_start_offset is not None:

# keep the line, start/end known pos for SPDX matching
spdx_prefix, spdx_expression = split_spdx_lid(line)
spdx_text = ''.join([spdx_prefix or '', spdx_expression])
Expand Down
Loading
Loading