Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion packages/utils/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
"dependencies": {
"@bundle-stats/plugin-webpack-filter": "^4.21.6",
"@bundle-stats/plugin-webpack-validate": "^4.21.6",
"serialize-query-params": "2.0.2"
"serialize-query-params": "2.0.2",
"text-similarity-node": "^1.0.1"
},
"devDependencies": {
"@types/jest": "29.5.14",
Expand Down
237 changes: 237 additions & 0 deletions packages/utils/src/utils/__tests__/string-similarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
import { compareTwoStrings, extractBestCandidates, compareWithCosine } from '../string-similarity';

describe('string-similarity', () => {
describe('compareTwoStrings', () => {
test('should return 1 for identical strings', () => {
expect(compareTwoStrings('hello', 'hello')).toBe(1);
expect(compareTwoStrings('test-file.js', 'test-file.js')).toBe(1);
});

test('should return 0 for completely different strings', () => {
const result = compareTwoStrings('abc', 'xyz');
expect(result).toBeGreaterThanOrEqual(0);
expect(result).toBeLessThan(0.5); // Should be low similarity
});

test('should return high similarity for similar strings', () => {
const result = compareTwoStrings('hello', 'hallo');
expect(result).toBeGreaterThan(0.8); // Jaro-Winkler is good at detecting typos
});

test('should handle case sensitivity', () => {
// Case-insensitive (default)
const caseInsensitive = compareTwoStrings('Hello', 'hello', false);
expect(caseInsensitive).toBe(1);

// Case-sensitive
const caseSensitive = compareTwoStrings('Hello', 'hello', true);
expect(caseSensitive).toBeLessThan(1);
});

test('should return 0 for empty strings', () => {
expect(compareTwoStrings('', 'hello')).toBe(0);
expect(compareTwoStrings('hello', '')).toBe(0);
expect(compareTwoStrings('', '')).toBe(0);
});

test('should work with file paths', () => {
const path1 = 'static/js/main.abc123.chunk.js';
const path2 = 'static/js/main.def456.chunk.js';
const result = compareTwoStrings(path1, path2);
expect(result).toBeGreaterThan(0.8); // High similarity despite different hashes
});

test('should work with webpack module names', () => {
const module1 = './node_modules/react/index.js';
const module2 = './node_modules/react/index.jsx';
const result = compareTwoStrings(module1, module2);
expect(result).toBeGreaterThan(0.9); // Very similar, just different extension
});
});

describe('extractBestCandidates', () => {
test('should find the best match from candidates', () => {
const result = extractBestCandidates('hello', ['hallo', 'world', 'help']);

expect(result.bestMatch.target).toBe('hallo');
expect(result.bestMatch.rating).toBeGreaterThan(0.8);
expect(result.bestMatchIndex).toBe(0);
});

test('should return all ratings', () => {
const result = extractBestCandidates('test', ['test', 'best', 'rest', 'west']);

expect(result.ratings).toHaveLength(4);
expect(result.ratings[0].target).toBe('test');
expect(result.ratings[0].rating).toBe(1); // Exact match
});

test('should handle file paths with hashes', () => {
const mainFile = 'main.abc123.js';
const candidates = ['main.def456.js', 'vendor.abc123.js', 'runtime.xyz789.js'];

const result = extractBestCandidates(mainFile, candidates);

expect(result.bestMatch.target).toBe('main.def456.js');
expect(result.bestMatch.rating).toBeGreaterThan(0.7);
});

test('should handle webpack chunk names', () => {
const mainChunk = 'static/js/2.abc123.chunk.js';
const candidates = [
'static/js/2.def456.chunk.js',
'static/js/3.abc123.chunk.js',
'static/css/2.abc123.chunk.css',
];

const result = extractBestCandidates(mainChunk, candidates);

// Both files have high similarity, but different strengths
// The algorithm finds the best match based on overall similarity
expect(result.bestMatch.rating).toBeGreaterThan(0.8);
expect(result.ratings).toHaveLength(3);

// Verify that static/js files have higher ratings than static/css
const jsFile1Rating = result.ratings[0].rating;
const jsFile2Rating = result.ratings[1].rating;
const cssFileRating = result.ratings[2].rating;

expect(Math.max(jsFile1Rating, jsFile2Rating)).toBeGreaterThan(cssFileRating);
});

test('should handle module paths', () => {
const mainModule = './node_modules/lodash/get.js';
const candidates = [
'./node_modules/lodash/set.js',
'./node_modules/lodash/get.js',
'./node_modules/react/index.js',
];

const result = extractBestCandidates(mainModule, candidates);

expect(result.bestMatch.target).toBe('./node_modules/lodash/get.js');
expect(result.bestMatch.rating).toBe(1); // Exact match
});

test('should handle empty candidates array', () => {
const result = extractBestCandidates('test', []);

expect(result.ratings).toHaveLength(0);
expect(result.bestMatch.target).toBe('');
expect(result.bestMatch.rating).toBe(0);
expect(result.bestMatchIndex).toBe(-1);
});

test('should handle empty main string', () => {
const result = extractBestCandidates('', ['test', 'best']);

expect(result.ratings).toHaveLength(0);
expect(result.bestMatch.target).toBe('');
expect(result.bestMatch.rating).toBe(0);
expect(result.bestMatchIndex).toBe(-1);
});

test('should handle case sensitivity', () => {
const candidates = ['Hello', 'hello', 'HELLO'];

// Case-insensitive (default)
const caseInsensitive = extractBestCandidates('hello', candidates, false);
expect(caseInsensitive.bestMatch.rating).toBe(1);

// Case-sensitive
const caseSensitive = extractBestCandidates('hello', candidates, true);
expect(caseSensitive.bestMatch.target).toBe('hello');
expect(caseSensitive.bestMatch.rating).toBe(1);
});

test('should work with real-world Next.js build output', () => {
const baselineFile = '_next/static/chunks/pages/index-abc123def.js';
const currentFiles = [
'_next/static/chunks/pages/index-xyz789abc.js',
'_next/static/chunks/pages/about-abc123def.js',
'_next/static/chunks/framework-123456789.js',
];

const result = extractBestCandidates(baselineFile, currentFiles);

expect(result.bestMatch.target).toBe('_next/static/chunks/pages/index-xyz789abc.js');
expect(result.bestMatch.rating).toBeGreaterThan(0.7);
});

test('should work with webpack module paths with loaders', () => {
const baselineModule = './src/components/Button.jsx';
const currentModules = [
'./src/components/Button.jsx',
'./src/components/Button.css',
'./src/components/Link.jsx',
];

const result = extractBestCandidates(baselineModule, currentModules);

expect(result.bestMatch.target).toBe('./src/components/Button.jsx');
expect(result.bestMatch.rating).toBe(1);
});
});

describe('compareWithCosine', () => {
test('should return 1 for identical strings', () => {
expect(compareWithCosine('hello world', 'hello world')).toBe(1);
});

test('should handle word-based tokenization', () => {
const result = compareWithCosine('hello world', 'world hello', true);
expect(result).toBeGreaterThan(0.9); // Word order shouldn't matter much
});

test('should handle character-based tokenization', () => {
const result = compareWithCosine('hello', 'hallo', false);
expect(result).toBeGreaterThanOrEqual(0.5);
});

test('should return 0 for empty strings', () => {
expect(compareWithCosine('', 'hello')).toBe(0);
expect(compareWithCosine('hello', '')).toBe(0);
});

test('should work with long file paths', () => {
const path1 = './src/components/features/dashboard/analytics/ChartComponent.jsx';
const path2 = './src/components/features/dashboard/analytics/TableComponent.jsx';
const result = compareWithCosine(path1, path2, true);
expect(result).toBeGreaterThan(0.6); // Should have high similarity
});
});

describe('Performance characteristics', () => {
test('should handle large candidate lists efficiently', () => {
const mainString = 'test-file-123.js';
const candidates = Array.from({ length: 1000 }, (_, i) => `file-${i}.js`);
candidates.push('test-file-456.js'); // Add a similar file

const startTime = performance.now();
const result = extractBestCandidates(mainString, candidates);
const endTime = performance.now();

expect(result.bestMatch.target).toBe('test-file-456.js');
expect(endTime - startTime).toBeLessThan(1000); // Should complete in less than 1 second
});
});

describe('Edge cases', () => {
test('should handle strings with special characters', () => {
const result = compareTwoStrings('test-file@2.0.0.js', 'test-file@2.0.1.js');
expect(result).toBeGreaterThan(0.8);
});

test('should handle unicode characters', () => {
const result = compareTwoStrings('café', 'cafe', false);
expect(result).toBeGreaterThan(0.5);
});

test('should handle very long strings', () => {
const longString1 = 'a'.repeat(1000);
const longString2 = `${'a'.repeat(999)}b`;
const result = compareTwoStrings(longString1, longString2);
expect(result).toBeGreaterThan(0.9); // Should be very similar
});
});
});
1 change: 1 addition & 0 deletions packages/utils/src/utils/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ export * from './insights';
export * from './file-types';
export * from './format';
export * from './metrics';
export * from './string-similarity';
118 changes: 118 additions & 0 deletions packages/utils/src/utils/string-similarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import * as textSimilarity from 'text-similarity-node';

export interface BestMatch {
target: string;
rating: number;
}

export interface BestMatchResult {
ratings: BestMatch[];
bestMatch: BestMatch;
bestMatchIndex: number;
}

/**
* Compare two strings and return a similarity score between 0 and 1
* Uses Jaro-Winkler algorithm which is optimized for short strings and proper names
*
* @param str1 - First string to compare
* @param str2 - Second string to compare
* @param caseSensitive - Whether comparison should be case-sensitive (default: false)
* @returns Similarity score between 0 (completely different) and 1 (identical)
*/
export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
if (!str1 || !str2) {
return 0;
}

if (str1 === str2) {
return 1;
}

// Use Jaro-Winkler algorithm for better performance with file paths and names
return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
};
Comment on lines +23 to +34
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical bug: Case-insensitive comparison broken by early exact equality check.

The exact equality check on line 28 uses ===, which is always case-sensitive. This breaks case-insensitive comparison. For example, compareTwoStrings('Hello', 'hello', false) should return 1 (as expected by the test at line 23-24), but the current implementation will:

  1. Pass through the === check (returns false)
  2. Call jaroWinkler('Hello', 'hello', false) which returns a value < 1

Apply this diff to fix the issue:

 export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
   if (!str1 || !str2) {
     return 0;
   }
 
-  if (str1 === str2) {
+  // Normalize for case-insensitive comparison
+  const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase();
+  const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase();
+
+  if (normalizedStr1 === normalizedStr2) {
     return 1;
   }
 
   // Use Jaro-Winkler algorithm for better performance with file paths and names
   return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
 };
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
if (!str1 || !str2) {
return 0;
}
if (str1 === str2) {
return 1;
}
// Use Jaro-Winkler algorithm for better performance with file paths and names
return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
};
export const compareTwoStrings = (str1: string, str2: string, caseSensitive = false): number => {
if (!str1 || !str2) {
return 0;
}
// Normalize for case-insensitive comparison
const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase();
const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase();
if (normalizedStr1 === normalizedStr2) {
return 1;
}
// Use Jaro-Winkler algorithm for better performance with file paths and names
return textSimilarity.similarity.jaroWinkler(str1, str2, caseSensitive);
};
🤖 Prompt for AI Agents
In packages/utils/src/utils/string-similarity.ts around lines 23 to 34, the
early exact equality check uses === which is always case-sensitive and breaks
case-insensitive mode; change that check to respect the caseSensitive parameter
(i.e., if caseSensitive is true use ===, otherwise compare str1.toLowerCase()
=== str2.toLowerCase()) so case-insensitive equal strings return 1 and only fall
through to jaroWinkler when they truly differ.


/**
* Find the best matching strings from a list of candidates
*
* @param mainString - The string to compare against
* @param targetStrings - Array of candidate strings to compare
* @param caseSensitive - Whether comparison should be case-sensitive (default: false)
* @returns Object containing all ratings, best match, and best match index
*
* @example
* ```typescript
* const result = extractBestCandidates('hello', ['hallo', 'world', 'help']);
* // result.bestMatch: { target: 'hallo', rating: 0.92 }
* // result.bestMatchIndex: 0
* // result.ratings: [
* // { target: 'hallo', rating: 0.92 },
* // { target: 'world', rating: 0.46 },
* // { target: 'help', rating: 0.85 }
* // ]
* ```
*/
export const extractBestCandidates = (
mainString: string,
targetStrings: string[],
caseSensitive = false,
): BestMatchResult => {
if (!mainString || !targetStrings || targetStrings.length === 0) {
return {
ratings: [],
bestMatch: { target: '', rating: 0 },
bestMatchIndex: -1,
};
}

// Calculate similarity for all candidates
const ratings: BestMatch[] = targetStrings.map((target) => ({
target,
rating: compareTwoStrings(mainString, target, caseSensitive),
}));

// Find the best match
let bestMatchIndex = 0;
let bestRating = ratings[0].rating;

for (let i = 1; i < ratings.length; i += 1) {
if (ratings[i].rating > bestRating) {
bestRating = ratings[i].rating;
bestMatchIndex = i;
}
}

return {
ratings,
bestMatch: ratings[bestMatchIndex],
bestMatchIndex,
};
};

/**
* Alternative comparison using Cosine similarity for longer strings
* This is more suitable for comparing longer file paths or content
*
* @param str1 - First string to compare
* @param str2 - Second string to compare
* @param useWords - Whether to tokenize by words (true) or characters (false)
* @param caseSensitive - Whether comparison should be case-sensitive
* @returns Similarity score between 0 and 1
*/
export const compareWithCosine = (
str1: string,
str2: string,
useWords = true,
caseSensitive = false,
): number => {
if (!str1 || !str2) {
return 0;
}

if (str1 === str2) {
return 1;
}

return textSimilarity.similarity.cosine(str1, str2, useWords, caseSensitive);
};
Comment on lines +103 to +118
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical bug: Same case-insensitive comparison issue.

This function has the same bug as compareTwoStrings. The exact equality check on line 113 uses ===, which breaks case-insensitive comparison.

Apply this diff to fix the issue:

 export const compareWithCosine = (
   str1: string,
   str2: string,
   useWords = true,
   caseSensitive = false,
 ): number => {
   if (!str1 || !str2) {
     return 0;
   }
 
-  if (str1 === str2) {
+  // Normalize for case-insensitive comparison
+  const normalizedStr1 = caseSensitive ? str1 : str1.toLowerCase();
+  const normalizedStr2 = caseSensitive ? str2 : str2.toLowerCase();
+
+  if (normalizedStr1 === normalizedStr2) {
     return 1;
   }
 
   return textSimilarity.similarity.cosine(str1, str2, useWords, caseSensitive);
 };
🤖 Prompt for AI Agents
In packages/utils/src/utils/string-similarity.ts around lines 103 to 118, the
exact-equality check uses === which ignores the caseSensitive parameter and
breaks case-insensitive comparisons; change the equality check to compare
normalized strings: if caseSensitive is false, compare str1.toLowerCase() ===
str2.toLowerCase(), otherwise keep the existing strict equality, then proceed to
call textSimilarity.similarity.cosine with the original parameters.