-
Notifications
You must be signed in to change notification settings - Fork 21
Replace string-comparison with text-similarity-node for improved perf… #5429
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Joaco2603
wants to merge
2
commits into
relative-ci:master
Choose a base branch
from
Joaco2603:feature/replace-string-comparison
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
237 changes: 237 additions & 0 deletions
237
packages/utils/src/utils/__tests__/string-similarity.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,237 @@ | ||
| import { compareTwoStrings, extractBestCandidates, compareWithCosine } from '../string-similarity'; | ||
|
|
||
| describe('string-similarity', () => { | ||
| describe('compareTwoStrings', () => { | ||
| test('should return 1 for identical strings', () => { | ||
| expect(compareTwoStrings('hello', 'hello')).toBe(1); | ||
| expect(compareTwoStrings('test-file.js', 'test-file.js')).toBe(1); | ||
| }); | ||
|
|
||
| test('should return 0 for completely different strings', () => { | ||
| const result = compareTwoStrings('abc', 'xyz'); | ||
| expect(result).toBeGreaterThanOrEqual(0); | ||
| expect(result).toBeLessThan(0.5); // Should be low similarity | ||
| }); | ||
|
|
||
| test('should return high similarity for similar strings', () => { | ||
| const result = compareTwoStrings('hello', 'hallo'); | ||
| expect(result).toBeGreaterThan(0.8); // Jaro-Winkler is good at detecting typos | ||
| }); | ||
|
|
||
| test('should handle case sensitivity', () => { | ||
| // Case-insensitive (default) | ||
| const caseInsensitive = compareTwoStrings('Hello', 'hello', false); | ||
| expect(caseInsensitive).toBe(1); | ||
|
|
||
| // Case-sensitive | ||
| const caseSensitive = compareTwoStrings('Hello', 'hello', true); | ||
| expect(caseSensitive).toBeLessThan(1); | ||
| }); | ||
|
|
||
| test('should return 0 for empty strings', () => { | ||
| expect(compareTwoStrings('', 'hello')).toBe(0); | ||
| expect(compareTwoStrings('hello', '')).toBe(0); | ||
| expect(compareTwoStrings('', '')).toBe(0); | ||
| }); | ||
|
|
||
| test('should work with file paths', () => { | ||
| const path1 = 'static/js/main.abc123.chunk.js'; | ||
| const path2 = 'static/js/main.def456.chunk.js'; | ||
| const result = compareTwoStrings(path1, path2); | ||
| expect(result).toBeGreaterThan(0.8); // High similarity despite different hashes | ||
| }); | ||
|
|
||
| test('should work with webpack module names', () => { | ||
| const module1 = './node_modules/react/index.js'; | ||
| const module2 = './node_modules/react/index.jsx'; | ||
| const result = compareTwoStrings(module1, module2); | ||
| expect(result).toBeGreaterThan(0.9); // Very similar, just different extension | ||
| }); | ||
| }); | ||
|
|
||
| describe('extractBestCandidates', () => { | ||
| test('should find the best match from candidates', () => { | ||
| const result = extractBestCandidates('hello', ['hallo', 'world', 'help']); | ||
|
|
||
| expect(result.bestMatch.target).toBe('hallo'); | ||
| expect(result.bestMatch.rating).toBeGreaterThan(0.8); | ||
| expect(result.bestMatchIndex).toBe(0); | ||
| }); | ||
|
|
||
| test('should return all ratings', () => { | ||
| const result = extractBestCandidates('test', ['test', 'best', 'rest', 'west']); | ||
|
|
||
| expect(result.ratings).toHaveLength(4); | ||
| expect(result.ratings[0].target).toBe('test'); | ||
| expect(result.ratings[0].rating).toBe(1); // Exact match | ||
| }); | ||
|
|
||
| test('should handle file paths with hashes', () => { | ||
| const mainFile = 'main.abc123.js'; | ||
| const candidates = ['main.def456.js', 'vendor.abc123.js', 'runtime.xyz789.js']; | ||
|
|
||
| const result = extractBestCandidates(mainFile, candidates); | ||
|
|
||
| expect(result.bestMatch.target).toBe('main.def456.js'); | ||
| expect(result.bestMatch.rating).toBeGreaterThan(0.7); | ||
| }); | ||
|
|
||
| test('should handle webpack chunk names', () => { | ||
| const mainChunk = 'static/js/2.abc123.chunk.js'; | ||
| const candidates = [ | ||
| 'static/js/2.def456.chunk.js', | ||
| 'static/js/3.abc123.chunk.js', | ||
| 'static/css/2.abc123.chunk.css', | ||
| ]; | ||
|
|
||
| const result = extractBestCandidates(mainChunk, candidates); | ||
|
|
||
| // Both files have high similarity, but different strengths | ||
| // The algorithm finds the best match based on overall similarity | ||
| expect(result.bestMatch.rating).toBeGreaterThan(0.8); | ||
| expect(result.ratings).toHaveLength(3); | ||
|
|
||
| // Verify that static/js files have higher ratings than static/css | ||
| const jsFile1Rating = result.ratings[0].rating; | ||
| const jsFile2Rating = result.ratings[1].rating; | ||
| const cssFileRating = result.ratings[2].rating; | ||
|
|
||
| expect(Math.max(jsFile1Rating, jsFile2Rating)).toBeGreaterThan(cssFileRating); | ||
| }); | ||
|
|
||
| test('should handle module paths', () => { | ||
| const mainModule = './node_modules/lodash/get.js'; | ||
| const candidates = [ | ||
| './node_modules/lodash/set.js', | ||
| './node_modules/lodash/get.js', | ||
| './node_modules/react/index.js', | ||
| ]; | ||
|
|
||
| const result = extractBestCandidates(mainModule, candidates); | ||
|
|
||
| expect(result.bestMatch.target).toBe('./node_modules/lodash/get.js'); | ||
| expect(result.bestMatch.rating).toBe(1); // Exact match | ||
| }); | ||
|
|
||
| test('should handle empty candidates array', () => { | ||
| const result = extractBestCandidates('test', []); | ||
|
|
||
| expect(result.ratings).toHaveLength(0); | ||
| expect(result.bestMatch.target).toBe(''); | ||
| expect(result.bestMatch.rating).toBe(0); | ||
| expect(result.bestMatchIndex).toBe(-1); | ||
| }); | ||
|
|
||
| test('should handle empty main string', () => { | ||
| const result = extractBestCandidates('', ['test', 'best']); | ||
|
|
||
| expect(result.ratings).toHaveLength(0); | ||
| expect(result.bestMatch.target).toBe(''); | ||
| expect(result.bestMatch.rating).toBe(0); | ||
| expect(result.bestMatchIndex).toBe(-1); | ||
| }); | ||
|
|
||
| test('should handle case sensitivity', () => { | ||
| const candidates = ['Hello', 'hello', 'HELLO']; | ||
|
|
||
| // Case-insensitive (default) | ||
| const caseInsensitive = extractBestCandidates('hello', candidates, false); | ||
| expect(caseInsensitive.bestMatch.rating).toBe(1); | ||
|
|
||
| // Case-sensitive | ||
| const caseSensitive = extractBestCandidates('hello', candidates, true); | ||
| expect(caseSensitive.bestMatch.target).toBe('hello'); | ||
| expect(caseSensitive.bestMatch.rating).toBe(1); | ||
| }); | ||
|
|
||
| test('should work with real-world Next.js build output', () => { | ||
| const baselineFile = '_next/static/chunks/pages/index-abc123def.js'; | ||
| const currentFiles = [ | ||
| '_next/static/chunks/pages/index-xyz789abc.js', | ||
| '_next/static/chunks/pages/about-abc123def.js', | ||
| '_next/static/chunks/framework-123456789.js', | ||
| ]; | ||
|
|
||
| const result = extractBestCandidates(baselineFile, currentFiles); | ||
|
|
||
| expect(result.bestMatch.target).toBe('_next/static/chunks/pages/index-xyz789abc.js'); | ||
| expect(result.bestMatch.rating).toBeGreaterThan(0.7); | ||
| }); | ||
|
|
||
| test('should work with webpack module paths with loaders', () => { | ||
| const baselineModule = './src/components/Button.jsx'; | ||
| const currentModules = [ | ||
| './src/components/Button.jsx', | ||
| './src/components/Button.css', | ||
| './src/components/Link.jsx', | ||
| ]; | ||
|
|
||
| const result = extractBestCandidates(baselineModule, currentModules); | ||
|
|
||
| expect(result.bestMatch.target).toBe('./src/components/Button.jsx'); | ||
| expect(result.bestMatch.rating).toBe(1); | ||
| }); | ||
| }); | ||
|
|
||
| describe('compareWithCosine', () => { | ||
| test('should return 1 for identical strings', () => { | ||
| expect(compareWithCosine('hello world', 'hello world')).toBe(1); | ||
| }); | ||
|
|
||
| test('should handle word-based tokenization', () => { | ||
| const result = compareWithCosine('hello world', 'world hello', true); | ||
| expect(result).toBeGreaterThan(0.9); // Word order shouldn't matter much | ||
| }); | ||
|
|
||
| test('should handle character-based tokenization', () => { | ||
| const result = compareWithCosine('hello', 'hallo', false); | ||
| expect(result).toBeGreaterThanOrEqual(0.5); | ||
| }); | ||
|
|
||
| test('should return 0 for empty strings', () => { | ||
| expect(compareWithCosine('', 'hello')).toBe(0); | ||
| expect(compareWithCosine('hello', '')).toBe(0); | ||
| }); | ||
|
|
||
| test('should work with long file paths', () => { | ||
| const path1 = './src/components/features/dashboard/analytics/ChartComponent.jsx'; | ||
| const path2 = './src/components/features/dashboard/analytics/TableComponent.jsx'; | ||
| const result = compareWithCosine(path1, path2, true); | ||
| expect(result).toBeGreaterThan(0.6); // Should have high similarity | ||
| }); | ||
| }); | ||
|
|
||
| describe('Performance characteristics', () => { | ||
| test('should handle large candidate lists efficiently', () => { | ||
| const mainString = 'test-file-123.js'; | ||
| const candidates = Array.from({ length: 1000 }, (_, i) => `file-${i}.js`); | ||
| candidates.push('test-file-456.js'); // Add a similar file | ||
|
|
||
| const startTime = performance.now(); | ||
| const result = extractBestCandidates(mainString, candidates); | ||
| const endTime = performance.now(); | ||
|
|
||
| expect(result.bestMatch.target).toBe('test-file-456.js'); | ||
| expect(endTime - startTime).toBeLessThan(1000); // Should complete in less than 1 second | ||
| }); | ||
| }); | ||
|
|
||
| describe('Edge cases', () => { | ||
| test('should handle strings with special characters', () => { | ||
| const result = compareTwoStrings('test-file@2.0.0.js', 'test-file@2.0.1.js'); | ||
| expect(result).toBeGreaterThan(0.8); | ||
| }); | ||
|
|
||
| test('should handle unicode characters', () => { | ||
| const result = compareTwoStrings('café', 'cafe', false); | ||
| expect(result).toBeGreaterThan(0.5); | ||
| }); | ||
|
|
||
| test('should handle very long strings', () => { | ||
| const longString1 = 'a'.repeat(1000); | ||
| const longString2 = `${'a'.repeat(999)}b`; | ||
| const result = compareTwoStrings(longString1, longString2); | ||
| expect(result).toBeGreaterThan(0.9); // Should be very similar | ||
| }); | ||
| }); | ||
| }); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| import * as textSimilarity from 'text-similarity-node'; | ||
|
|
||
| export interface BestMatch { | ||
| target: string; | ||
| rating: number; | ||
| } | ||
|
|
||
| export interface BestMatchResult { | ||
| ratings: BestMatch[]; | ||
| bestMatch: BestMatch; | ||
| bestMatchIndex: number; | ||
| } | ||
|
|
||
| /** | ||
| * Compare two strings and return a similarity score between 0 and 1 | ||
| * Uses Jaro-Winkler algorithm which is optimized for short strings and proper names | ||
| * | ||
| * @param str1 - First string to compare | ||
| * @param str2 - Second string to compare | ||
| * @param caseSensitive - Whether comparison should be case-sensitive (default: false) | ||
| * @returns Similarity score between 0 (completely different) and 1 (identical) | ||
| */ | ||
| export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => { | ||
| if (!str1 || !str2) { | ||
| return 0; | ||
| } | ||
|
|
||
| if (str1 === str2) { | ||
| return 1; | ||
| } | ||
|
|
||
| // Use Jaro-Winkler algorithm for better performance with file paths and names | ||
| return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive); | ||
| }; | ||
|
|
||
| /** | ||
| * Find the best matching strings from a list of candidates | ||
| * | ||
| * @param mainString - The string to compare against | ||
| * @param targetStrings - Array of candidate strings to compare | ||
| * @param caseSensitive - Whether comparison should be case-sensitive (default: false) | ||
| * @returns Object containing all ratings, best match, and best match index | ||
| * | ||
| * @example | ||
| * ```typescript | ||
| * const result = extractBestCandidates('hello', ['hallo', 'world', 'help']); | ||
| * // result.bestMatch: { target: 'hallo', rating: 0.92 } | ||
| * // result.bestMatchIndex: 0 | ||
| * // result.ratings: [ | ||
| * // { target: 'hallo', rating: 0.92 }, | ||
| * // { target: 'world', rating: 0.46 }, | ||
| * // { target: 'help', rating: 0.85 } | ||
| * // ] | ||
| * ``` | ||
| */ | ||
| export const extractBestCandidates = ( | ||
| mainString: string, | ||
| targetStrings: string[], | ||
| caseSensitive = false, | ||
| ): BestMatchResult => { | ||
| if (!mainString || !targetStrings || targetStrings.length === 0) { | ||
| return { | ||
| ratings: [], | ||
| bestMatch: { target: '', rating: 0 }, | ||
| bestMatchIndex: -1, | ||
| }; | ||
| } | ||
|
|
||
| // Calculate similarity for all candidates | ||
| const ratings: BestMatch[] = targetStrings.map((target) => ({ | ||
| target, | ||
| rating: compareTwoStrings(mainString, target, caseSensitive), | ||
| })); | ||
|
|
||
| // Find the best match | ||
| let bestMatchIndex = 0; | ||
| let bestRating = ratings[0].rating; | ||
|
|
||
| for (let i = 1; i < ratings.length; i += 1) { | ||
| if (ratings[i].rating > bestRating) { | ||
| bestRating = ratings[i].rating; | ||
| bestMatchIndex = i; | ||
| } | ||
| } | ||
|
|
||
| return { | ||
| ratings, | ||
| bestMatch: ratings[bestMatchIndex], | ||
| bestMatchIndex, | ||
| }; | ||
| }; | ||
|
|
||
| /** | ||
| * Alternative comparison using Cosine similarity for longer strings | ||
| * This is more suitable for comparing longer file paths or content | ||
| * | ||
| * @param str1 - First string to compare | ||
| * @param str2 - Second string to compare | ||
| * @param useWords - Whether to tokenize by words (true) or characters (false) | ||
| * @param caseSensitive - Whether comparison should be case-sensitive | ||
| * @returns Similarity score between 0 and 1 | ||
| */ | ||
| export const compareWithCosine = ( | ||
| str1: string, | ||
| str2: string, | ||
| useWords = true, | ||
| caseSensitive = false, | ||
| ): number => { | ||
| if (!str1 || !str2) { | ||
| return 0; | ||
| } | ||
|
|
||
| if (str1 === str2) { | ||
| return 1; | ||
| } | ||
|
|
||
| return textSimilarity.similarity.cosine(str1, str2, useWords, caseSensitive); | ||
| }; | ||
|
Comment on lines
+103
to
+118
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Critical bug: Same case-insensitive comparison issue. This function has the same bug as Apply this diff to fix the issue: export const compareWithCosine = (
str1: string,
str2: string,
useWords = true,
caseSensitive = false,
): number => {
if (!str1 || !str2) {
return 0;
}
- if (str1 === str2) {
+ // Normalize for case-insensitive comparison
+ const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase();
+ const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase();
+
+ if (normalizedStr1 === normalizedStr2) {
return 1;
}
return textSimilarity.similarity.cosine(str1, str2, useWords, caseSensitive);
};🤖 Prompt for AI Agents |
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Critical bug: Case-insensitive comparison broken by early exact equality check.
The exact equality check on line 28 uses
===, which is always case-sensitive. This breaks case-insensitive comparison. For example,compareTwoStrings('Hello', 'hello', false)should return 1 (as expected by the test at line 23-24), but the current implementation will:===check (returns false)jaroWinkler('Hello', 'hello', false)which returns a value < 1Apply this diff to fix the issue:
export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => { if (!str1 || !str2) { return 0; } - if (str1 === str2) { + // Normalize for case-insensitive comparison + const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase(); + const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase(); + + if (normalizedStr1 === normalizedStr2) { return 1; } // Use Jaro-Winkler algorithm for better performance with file paths and names return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive); };📝 Committable suggestion
🤖 Prompt for AI Agents