Skip to content

Commit 97ea77f

Browse files
Aniela Amy and marklogic-builder
authored and committed
DHFPROD-10507: in-memory fuzzy match
1 parent b9f03c9 commit 97ea77f

File tree

4 files changed

+103
-75
lines changed

4 files changed

+103
-75
lines changed

marklogic-data-hub/src/main/resources/ml-modules/root/data-hub/5/mastering/matching/matchable.mjs

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -404,6 +404,9 @@ class MatchRulesetDefinition {
404404
}
405405
}
406406
}
407+
if (matchingTraceEnabled) {
408+
xdmp.trace(matchingTraceEvent, `Excluded values: ${xdmp.toJsonString([...this.excludedValues])}`);
409+
}
407410
}
408411

409412
name() {
@@ -422,7 +425,7 @@ class MatchRulesetDefinition {
422425
if (!matchRule._valueFunction) {
423426
const pathKey = matchRule.documentXPath || matchRule.entityPropertyPath;
424427
matchRule._valueFunction = (contentObject) => {
425-
const key = `${contentObject.uri}:${pathKey}`;
428+
const key = `${contentObject.uri}:${this.exclusionListNames.sort().join(":")}:${pathKey}`;
426429
if (!cachedPropertyValues.has(key)) {
427430
let values;
428431
if (matchRule.documentXPath) {
@@ -489,7 +492,7 @@ class MatchRulesetDefinition {
489492
let matchRule = passMatchRule.toObject();
490493
let dictionary = matchRule.options.dictionaryURI;
491494
let spellOption = {
492-
distanceThreshold: matchRule.options.distanceThreshold
495+
distanceThreshold: matchRule.options.distanceThreshold || 100
493496
};
494497
let results;
495498
try {
@@ -680,7 +683,7 @@ class MatchRulesetDefinition {
680683
if (query) {
681684
hashes = [...matchingXqy.queryToHashes(query, this.fuzzyMatch())];
682685
}
683-
this._cachedQueryHashes.set(uri, new Set(hashes));
686+
this._cachedQueryHashes.set(uri, hashes);
684687
}
685688
return this._cachedQueryHashes.get(uri);
686689
}

marklogic-data-hub/src/main/resources/ml-modules/root/data-hub/5/mastering/matching/matcher.mjs

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -57,23 +57,24 @@ function gatherThresholdQueryFunctions(thresholdDefinitions) {
5757

5858
function addHashesToTripleArray(contentObject, matchRulesetDefinitions, triplesByUri, inMemoryTriples) {
5959
for (const matchRuleset of matchRulesetDefinitions) {
60-
const queryHashes = matchRuleset.queryHashes(contentObject, matchRuleset.fuzzyMatch());
60+
const queryHashes = matchRuleset.queryHashes(contentObject);
6161
for (const queryHash of queryHashes) {
6262
let uriTriples = triplesByUri.get(contentObject.uri);
6363
if (!uriTriples) {
6464
uriTriples = [];
6565
triplesByUri.set(contentObject.uri, uriTriples);
6666
}
67-
const uriToHashTriple = sem.triple(contentObject.uri, queryHashPredicate, queryHash);
68-
const hashToRulesetTriple = sem.triple(queryHash, hashBelongToPredicate, matchRuleset.name());
67+
const uriToHashTriple = sem.triple(contentObject.uri, queryHashPredicate, queryHash, fuzzyMatchHashesCollection);
68+
const hashToRulesetTriple = sem.triple(queryHash, hashBelongToPredicate, matchRuleset.name(), fuzzyMatchHashesCollection);
6969
inMemoryTriples.push(uriToHashTriple, hashToRulesetTriple);
70-
uriTriples.push(uriToHashTriple, hashToRulesetTriple);
70+
//uriTriples.push(uriToHashTriple, hashToRulesetTriple);
7171
}
7272
}
7373
}
7474

7575
function getMatchingURIs(matchable, contentObject, baselineQuery, filterQuery, thresholdQueryFunctions) {
7676
let allMatchingBatchUris = [];
77+
7778
for (const thresholdQueryFunction of thresholdQueryFunctions) {
7879
const thresholdQuery = thresholdQueryFunction(contentObject);
7980
if (!thresholdQuery) {
@@ -273,37 +274,37 @@ function addHashMatchesToMatchSummary(matchable, matchSummary, uris, inMemoryTri
273274
?originalUri <http://marklogic.com/data-hub/mastering#hasMatchingHash> ?uriHash.
274275
FILTER (?matchingUri = $uris)
275276
}`, {uris}, [], [sem.inMemoryStore(inMemoryTriples), sem.store(["document"], cts.collectionQuery(fuzzyMatchHashesCollection))]).toArray().reduce((hashMatches, triple) => {
276-
let {originalUri, matchingUri, matchRuleset} = triple;
277-
originalUri = fn.string(originalUri), matchingUri = fn.string(matchingUri), matchRuleset = fn.string(matchRuleset);
278-
let currentHashMatch = hashMatches.get(originalUri);
279-
if (!currentHashMatch) {
280-
currentHashMatch = {matches: new Map()};
281-
hashMatches.set(originalUri, currentHashMatch);
282-
}
283-
const uriMatches = currentHashMatch.matches;
284-
if (matchingUri === originalUri) {
277+
let {originalUri, matchingUri, matchRuleset} = triple;
278+
originalUri = fn.string(originalUri), matchingUri = fn.string(matchingUri), matchRuleset = fn.string(matchRuleset);
279+
let currentHashMatch = hashMatches[originalUri];
280+
if (!currentHashMatch) {
281+
currentHashMatch = {matches: {}};
282+
hashMatches[originalUri] = currentHashMatch;
283+
}
284+
const uriMatches = currentHashMatch.matches;
285+
if (matchingUri === originalUri) {
286+
return hashMatches;
287+
}
288+
let match = uriMatches[matchingUri];
289+
if (!match) {
290+
match = {matchedRulesets: []};
291+
uriMatches[matchingUri] = match;
292+
}
293+
match.matchedRulesets.push(matchRuleset);
285294
return hashMatches;
286-
}
287-
let match = uriMatches.get(matchingUri);
288-
if (!match) {
289-
match = {matchedRulesets: []};
290-
uriMatches.set(matchingUri, match);
291-
}
292-
match.matchedRulesets.push(matchRuleset);
293-
return hashMatches;
294-
}, new Map());
295-
populateContentObjects(results.keys());
296-
for (const resultEntry of results.entries()) {
297-
const matchUri = resultEntry[0];
298-
const matches = resultEntry[1];
295+
}, {});
296+
const contentUris = Object.keys(results);
297+
populateContentObjects(contentUris);
298+
for (const matchUri of contentUris) {
299+
const matches = results[matchUri];
299300
const currentContentObject = getContentObject(matchUri);
300301
if (!currentContentObject) {
301302
continue;
302303
}
303304
const groupByThreshold = {};
304-
for (const matchesEntry of matches.matches.entries()) {
305-
const matchedUri = matchesEntry[0];
306-
const match = matchesEntry[1];
305+
const matchedUris = Object.keys(matches.matches);
306+
for (const matchedUri of matchedUris) {
307+
const match = matches.matches[matchedUri];
307308
if (!(match && match.matchedRulesets)) {
308309
continue;
309310
}
@@ -341,7 +342,7 @@ function addHashMatchesToMatchSummary(matchable, matchSummary, uris, inMemoryTri
341342
continue;
342343
}
343344
const thresholdDefinition = thresholdDefinitionsByName[thresholdName];
344-
const matchDocSet = thresholdMatches.map((uri) => getContentObject(uri)).filter((content) => {
345+
const matchDocSet = [matchUri, ...thresholdMatches].map((uri) => getContentObject(uri)).filter((content) => {
345346
return content && (matchable.scoreDocument(currentContentObject, content) >= thresholdDefinition.score());
346347
});
347348
thresholdMatches = matchDocSet.map(content => content.uri);

marklogic-data-hub/src/main/resources/ml-modules/root/data-hub/data-services/mastering/previewMatchingActivity.mjs

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ const {populateContentObjects, getContentObject} = common;
2727
const previewMatchingActivityLib = require("/data-hub/5/mastering/preview-matching-activity-lib.xqy");
2828

2929
const stepName = external.stepName;
30-
const uris = external.uris;
31-
const sampleSize = external.sampleSize;
30+
const sampleSize = external.sampleSize || 100;
3231
const restrictToUris = external.restrictToUris;
32+
const uris = external.uris && (restrictToUris || external.uris.length > 0) ? external.uris: null;
3333
const nonMatches = external.nonMatches;
3434

3535
xdmp.securityAssert("http://marklogic.com/data-hub/privileges/read-match-merge", "execute");
@@ -50,38 +50,51 @@ let resultFunction = function() {
5050
score: minMatchRuleWeight
5151
});
5252
}
53-
const urisSelection = uris ? uris: cts.uris(null, ["score-zero", "concurrent", `limit=${sampleSize}`], sourceQuery, 0);
54-
populateContentObjects(uris);
55-
const urisQuery = cts.documentQuery(urisSelection);
56-
57-
if (restrictToUris) {
58-
step.filterQuery = step.filterQuery ? cts.andQuery([cts.query(step.filterQuery), urisQuery]) : urisQuery;
59-
}
60-
const matchable = new Matchable(step);
61-
const output = [];
62-
const content = Sequence.from(hubUtils.queryToContentDescriptorArray(urisQuery));
63-
const results = fn.exists(content) ? matcher.buildMatchSummary(matchable, content)[0]: {matchSummary: {actionDetails: {}}};
53+
6454
const allUris = new Set();
55+
const encounteredPairs = new Set();
56+
const matchable = new Matchable(step);
57+
6558
let pairCount = 0;
66-
for (const [actionUri, actionDetails] of Object.entries(results.matchSummary.actionDetails)) {
67-
if (nonMatches && actionDetails.thresholdName !== "Not Matched") {
68-
continue;
59+
const output = [];
60+
const maxPages = uris && uris.length >= 0 ? 1: 5;
61+
const originalFilterQuery = step.filterQuery;
62+
for (let i = 0; i < maxPages; i++) {
63+
const urisSelection = uris ? uris: cts.uris(null, ["score-zero", "concurrent", `skip=${sampleSize * i}`, `truncate=${sampleSize}`], sourceQuery, 0);
64+
if (!urisSelection || fn.empty(urisSelection)) {
65+
break;
6966
}
70-
const uris = actionDetails.uris;
71-
uris.forEach(uri => allUris.add(uri));
72-
populateContentObjects(uris);
73-
const referenceMatchResult = actionDetails.matchResults.find(matchingResult => matchingResult.score === "referenceDocument");
74-
if (referenceMatchResult && referenceMatchResult.uri) {
75-
const comparingURI = referenceMatchResult.uri;
67+
populateContentObjects(urisSelection);
68+
const urisQuery = cts.documentQuery(urisSelection);
69+
70+
if (restrictToUris) {
71+
step.filterQuery = originalFilterQuery ? cts.andQuery([cts.query(originalFilterQuery), urisQuery]) : urisQuery;
72+
}
73+
const content = Sequence.from(hubUtils.queryToContentDescriptorArray(urisQuery));
74+
const results = fn.exists(content) ? matcher.buildMatchSummary(matchable, content)[0]: {matchSummary: {actionDetails: {}}};
75+
for (const [actionUri, actionDetails] of Object.entries(results.matchSummary.actionDetails)) {
76+
if (nonMatches && actionDetails.thresholdName !== "Not Matched") {
77+
continue;
78+
}
79+
const uris = actionDetails.uris;
80+
uris.forEach(uri => allUris.add(uri));
81+
populateContentObjects(uris);
82+
let comparingUri = actionDetails.uris[0];
83+
const referenceMatchResult = actionDetails.matchResults.find(matchingResult => matchingResult.score === "referenceDocument");
84+
if (referenceMatchResult && referenceMatchResult.uri) {
85+
comparingUri = referenceMatchResult.uri;
86+
}
7687
for (const matchingResult of actionDetails.matchResults) {
77-
if (matchingResult.uri === comparingURI) {
88+
const pairKey = [matchingResult.uri, comparingUri].sort().join(":");
89+
if (matchingResult.uri === comparingUri || encounteredPairs.has(pairKey)) {
7890
continue;
7991
}
92+
encounteredPairs.add(pairKey);
8093
output.push({
8194
name: actionDetails.thresholdName,
8295
action: actionDetails.action,
8396
score: matchingResult.score,
84-
uris: [comparingURI, matchingResult.uri],
97+
uris: [comparingUri, matchingResult.uri],
8598
matchRulesets: matchingResult.matchedRulesets.map(matched => matched.rulesetName)
8699
});
87100
if (++pairCount === sampleSize) {
@@ -92,6 +105,9 @@ let resultFunction = function() {
92105
break;
93106
}
94107
}
108+
if (pairCount >= sampleSize) {
109+
break;
110+
}
95111
}
96112
return {
97113
sampleSize,
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,17 @@
11
const test = require("/test/test-helper.xqy");
22

3-
const pma = require("/data-hub/5/mastering/preview-matching-activity-lib.xqy");
3+
const pma = {
4+
previewMatchingActivity: (stepName, allUris, restrictToUris, sampleSize = 100) => {
5+
const results = fn.head(xdmp.invoke("/data-hub/data-services/mastering/previewMatchingActivity.mjs", {
6+
stepName,
7+
uris: allUris,
8+
sampleSize,
9+
restrictToUris,
10+
nonMatches: false
11+
}));
12+
return results;
13+
}
14+
};
415

516
let allUris = [
617
"/content/CustMatchMerge1.json",
@@ -11,51 +22,48 @@ let allUris = [
1122
"/content/CustShippingCityStateMatch4.json"
1223
];
1324

14-
const options = cts.doc("/steps/matching/matchCustomers.step.json").root;
15-
const sourceQuery = xdmp.eval(options.sourceQuery);
16-
1725
const results = [];
1826

1927
const verifyPrimaryKeys = function(result) {
20-
const resultObj = result.toObject();
2128
const allURIs = [];
22-
for (let action of resultObj.actionPreview) {
29+
for (let action of result.actionPreview) {
2330
for (let uri of action.uris) {
2431
if (!allURIs.includes(allURIs)) {
2532
allURIs.push(uri);
26-
results.push(test.assertEqual(fn.string(cts.doc(uri).toObject().envelope.instance.Customer.customerId) || uri, resultObj.primaryKeys[uri], `Unexpected primary key value! primaryKeys: ${xdmp.toJsonString(resultObj.primaryKeys)}`));
33+
results.push(test.assertEqual(fn.string(cts.doc(uri).toObject().envelope.instance.Customer.customerId) || uri, result.primaryKeys[uri], `Unexpected primary key value! primaryKeys: ${xdmp.toJsonString(result.primaryKeys)}`));
2734
}
2835
}
2936
}
3037
};
3138

32-
let allUrisResults = pma.previewMatchingActivity(options, sourceQuery, allUris, false, 0);
39+
let allUrisResults = pma.previewMatchingActivity("matchCustomers", allUris, false);
3340
verifyPrimaryKeys(allUrisResults);
34-
results.push(test.assertEqual(7, allUrisResults.actionPreview.length, "There should be 7 matching pairs"));
41+
results.push(test.assertEqual(6, allUrisResults.actionPreview.length, `There should be 6 matching pairs. ${xdmp.toJsonString(allUrisResults)}`));
3542

3643
let uris0_1 = [allUris[0] , allUris[1]];
37-
let uris_0_1_Results = pma.previewMatchingActivity(options, sourceQuery, uris0_1, false, 0);
44+
let uris_0_1_Results = pma.previewMatchingActivity("matchCustomers", uris0_1, false);
3845
verifyPrimaryKeys(uris_0_1_Results);
3946
results.push(test.assertEqual(2, uris_0_1_Results.uris.length, "There should be 2 URIs in the response's URIs array for match on docs 0 and 1"));
4047
results.push(test.assertEqual(1, uris_0_1_Results.actionPreview.length, "There should be only one matching pair for match on docs 0 and 1"));
4148

4249
let uris2_3 = [allUris[2] , allUris[3]];
43-
let uris_2_3_Results = pma.previewMatchingActivity(options, sourceQuery, uris2_3, false, 0);
50+
let uris_2_3_Results = pma.previewMatchingActivity("matchCustomers", uris2_3, false);
4451
verifyPrimaryKeys(uris_2_3_Results);
4552
results.push(test.assertEqual(2, uris_2_3_Results.uris.length, "There should be 2 URIs in the response's URIs array for match on docs 2 and 3"));
46-
results.push(test.assertEqual(5, uris_2_3_Results.actionPreview.length, `There should be 5 matching pairs for match on docs 2 and 3. actionPreview: ${xdmp.toJsonString(uris_2_3_Results.actionPreview)}`));
47-
results.push(test.assertEqual("7.5", uris_2_3_Results.actionPreview[0].score.toString(), "For match with docs 1 and 2, the first score should be 7.5 for the match between 2 and 3"));
48-
results.push(test.assertEqual("9.5", uris_2_3_Results.actionPreview[1].score.toString(), "For match with docs 1 and 3, the 2nd score should be 9.5"));
49-
results.push(test.assertEqual("7.5", uris_2_3_Results.actionPreview[4].score.toString(), "For match with docs 2 and 3, the last score should be 7.5"));
53+
results.push(test.assertEqual(4, uris_2_3_Results.actionPreview.length, `There should be 4 matching pairs for match on docs 2 and 3. actionPreview: ${xdmp.toJsonString(uris_2_3_Results.actionPreview)}`));
54+
results.push(test.assertEqual("9.5", uris_2_3_Results.actionPreview[0].score.toString(), "For match with docs 2 and 4, the first score should be 9.5 for the match between 2 and 4"));
55+
results.push(test.assertEqual("7.5", uris_2_3_Results.actionPreview[1].score.toString(), "For match with docs 1 and 2, the 2nd score should be 7.5"));
56+
results.push(test.assertEqual("7.5", uris_2_3_Results.actionPreview[2].score.toString(), "For match with docs 1 and 4, the last score should be 7.5"));
57+
results.push(test.assertEqual("7.5", uris_2_3_Results.actionPreview[2].score.toString(), "For match with docs 1 and 3, the last score should be 7.5"));
5058

51-
let uris_2_3_ResultRestrictedToURIs = pma.previewMatchingActivity(options, sourceQuery, uris2_3, true, 0);
59+
let uris_2_3_ResultRestrictedToURIs = pma.previewMatchingActivity("matchCustomers", uris2_3, true);
5260
verifyPrimaryKeys(uris_2_3_ResultRestrictedToURIs);
5361
results.push(test.assertEqual(2, uris_2_3_ResultRestrictedToURIs.uris.length, "There should be 2 URIs in the response's URIs array for match on docs 2 and 3"));
5462
results.push(test.assertEqual(1, uris_2_3_ResultRestrictedToURIs.actionPreview.length, `There should be 1 matching pair for match on docs 2 and 3 when not including the entire dataset. actionPreview: ${xdmp.toJsonString(uris_2_3_ResultRestrictedToURIs.actionPreview)}`));
5563
results.push(test.assertEqual("7.5", uris_2_3_ResultRestrictedToURIs.actionPreview[0].score.toString(), "For match with docs 2 and 3, the last score should be 7.5"));
5664

57-
let sampleResults = pma.previewMatchingActivity(options, sourceQuery, [], false, 3);
65+
let sampleResults = pma.previewMatchingActivity("matchCustomers", [], false, 3);
5866
verifyPrimaryKeys(sampleResults);
59-
results.push(test.assertEqual(3, sampleResults.uris.length, "There should be 3 URIs in the response's URIs array when sampleSize is 3"));
67+
results.push(test.assertEqual(3, sampleResults.actionPreview.length, `There should be 3 pairs in the response's URIs array when sampleSize is 3. ${xdmp.toJsonString(sampleResults)}`));
6068

6169
results;

0 commit comments

Comments
 (0)