Skip to content

Commit 9ffbb97

Browse files
authored
Merge pull request #10034 from bhandras/fuzzy-matching-script
scripts: add cherry-pick verification tool with fuzzy matching
2 parents b5c290d + cabb3a0 commit 9ffbb97

File tree

1 file changed

+252
-0
lines changed

1 file changed

+252
-0
lines changed
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
#!/usr/bin/env bash
2+
3+
# `set -euo pipefail` is intentionally disabled to prevent premature exits on
# Linux caused by process substitution failures. Some commands (e.g.
# `diff <(...) <(...)`) can fail when their input is empty or a pipe breaks,
# and this script deliberately tolerates those failures. macOS handles these
# cases more gracefully, but GNU diff on Linux does not, which would lead to
# hard script exits mid-match.
#
# set -euo pipefail
10+
11+
# Default settings; each is overwritten by the matching command-line option.
#   SRC_BRANCH / RELEASE_BRANCH - required, so they start out empty.
#   SRC_SCAN_LIMIT              - max number of source commits to scan.
#   RELEASE_LIMIT               - max number of release commits to compare;
#                                 0 means "all".
SRC_BRANCH=
RELEASE_BRANCH=
SRC_SCAN_LIMIT=1000
RELEASE_LIMIT=0
15+
16+
#######################################
# Print the usage/help text and terminate the script.
# Arguments: none
# Outputs:   help text on stdout ($0 is substituted into the usage line)
# Returns:   never returns; always exits with status 1
#######################################
show_help() {
  # Unquoted heredoc delimiter so that $0 is expanded in the usage line.
  cat <<HELP_TEXT

🔍 fuzzy-match-release-branch.sh

 Compares commits in a release branch to those in a source branch (e.g. master) and identifies
 cherry-picked commits based on patch equivalence or fuzzy metadata (subject, author, date).

 ❓ Use this to:
 - Audit cherry-picks in release branches
 - Detect missing or altered backports
 - Spot accidental omissions during cherry-pick workflows

 📦 Usage:
 $0 --source <branch> --release <branch> [--scan-limit N] [--limit N]

 🔧 Options:
 --source Source branch where original commits exist (e.g. master)
 --release Release branch to check for matching cherry-picks
 --scan-limit Max commits to scan in source branch (default: 1000)
 --limit Number of release commits to compare (default: all)

 🧪 Example: Find the closest matches for the last 92 commits in 0-19-2-branch-rc2 from master (scanning up to 300 commits):

 ./scripts/fuzzy-match-release-branch.sh --source master --release 0-19-2-branch-rc2 --limit 92 --scan-limit 300

 📝 Notes:
 - Requires git history for both branches to be present locally
 - Patch comparison is normalized (removes index lines, trims whitespace)
 - Fuzzy matching uses subject + author + date if no exact patch match found

HELP_TEXT
  exit 1
}
48+
49+
#######################################
# Filter a patch so that blob-hash "index" lines do not affect comparison.
# Reads the patch on stdin and writes it to stdout with every line of the
# form "index <hex>..<hex> <6-digit mode>" removed (each <hex> being at
# least 7 hex digits). Index lines without a trailing mode are kept.
#######################################
normalize_patch() {
  sed -E '/^index [0-9a-f]{7,}\.\.[0-9a-f]{7,} [0-9]{6}$/d'
}
52+
53+
# Parse args
#
# Each value-taking option must be followed by a non-empty value that does not
# start with "-". Usage errors are written to stderr and end in show_help,
# which exits with status 1.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --source|--release|--scan-limit|--limit)
      if [[ -z "${2:-}" || "$2" =~ ^- ]]; then
        echo "Error: Missing value for argument $1" >&2
        show_help
      fi
      case "$1" in
        --source) SRC_BRANCH="$2" ;;
        --release) RELEASE_BRANCH="$2" ;;
        --scan-limit|--limit)
          # The limits are later used in arithmetic tests and handed to
          # `head -n` / `git rev-list --max-count`, so accept only plain
          # non-negative integers instead of letting bash evaluate arbitrary
          # text arithmetically.
          if [[ ! "$2" =~ ^[0-9]+$ ]]; then
            echo "Error: $1 expects a non-negative integer, got '$2'" >&2
            show_help
          fi
          # Force base 10 so values such as "010" or "09" are not read as
          # octal by later arithmetic comparisons.
          if [[ "$1" == "--scan-limit" ]]; then
            SRC_SCAN_LIMIT=$((10#$2))
          else
            RELEASE_LIMIT=$((10#$2))
          fi
          ;;
      esac
      shift 2
      ;;
    -h|--help)
      # show_help always exits 1; run it in a subshell so that an explicit
      # request for help can finish with a success status.
      ( show_help )
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      show_help
      ;;
  esac
done

if [[ -z "$SRC_BRANCH" || -z "$RELEASE_BRANCH" ]]; then
  echo "❌ Missing required arguments." >&2
  show_help
fi
77+
78+
# Cross-platform hashing
#######################################
# Read data on stdin and print a single whitespace-free digest token on
# stdout. Uses md5sum when available, then md5, then POSIX cksum.
# Outputs: the digest on stdout; an error on stderr if no tool is available
# Returns: non-zero when no hashing tool could be found
#######################################
hash_patch() {
  if command -v md5sum >/dev/null 2>&1; then
    md5sum | awk '{print $1}'
  elif command -v md5 >/dev/null 2>&1; then
    md5 | awk '{print $NF}'
  elif command -v cksum >/dev/null 2>&1; then
    # Last-resort fallback: combine the CRC and the byte count into one token.
    cksum | awk '{print $1 "-" $2}'
  else
    # Without this branch the function would silently print an empty hash,
    # and every patch would compare equal to every other one.
    echo "Error: no hashing tool found (tried md5sum, md5, cksum)" >&2
    return 1
  fi
}
86+
87+
# Print the effective settings, then refresh remote refs.
# A release limit of 0 (or less) means that all release commits are compared.
if [[ "$RELEASE_LIMIT" -gt 0 ]]; then
  release_limit_label="$RELEASE_LIMIT"
else
  release_limit_label="ALL"
fi

echo "🔍 Preparing comparison:"
echo " Source branch : $SRC_BRANCH"
echo " Release branch : $RELEASE_BRANCH"
echo " Max source scan: $SRC_SCAN_LIMIT"
# Printed without surrounding quote characters, like the values above.
echo " Max release compare: $release_limit_label"
echo ""

echo "🔄 Fetching latest refs..."
# Best effort: a failed fetch is ignored and the script continues with
# whatever refs are already present locally.
git fetch --all --quiet || true
97+
echo "📥 Collecting release commits..."
# Non-merge commits reachable from the release branch but not from the source
# branch. --max-count keeps the newest RELEASE_LIMIT commits and --reverse then
# emits the selected commits oldest first.
release_rev_args=(--no-merges --reverse)
if [[ "$RELEASE_LIMIT" -gt 0 ]]; then
  release_rev_args+=(--max-count="$RELEASE_LIMIT")
fi

# Distinguish "git could not resolve the branches" from "nothing to compare".
if ! RELEASE_COMMITS=$(git rev-list "${release_rev_args[@]}" "$RELEASE_BRANCH" ^"$SRC_BRANCH"); then
  echo "❌ Failed to list commits in '$RELEASE_BRANCH' that are not in '$SRC_BRANCH'." >&2
  exit 1
fi

RELEASE_COMMITS_ARRAY=()
while IFS= read -r line; do
  if [[ -n "$line" ]]; then
    RELEASE_COMMITS_ARRAY+=("$line")
  fi
done <<< "$RELEASE_COMMITS"
echo " → Found ${#RELEASE_COMMITS_ARRAY[@]} release commits."

if [[ "${#RELEASE_COMMITS_ARRAY[@]}" -eq 0 ]]; then
  echo "❌ No release commits found. Exiting."
  exit 1
fi
113+
114+
echo "📥 Collecting source commits..."
# Newest non-merge commits of the source branch, capped at SRC_SCAN_LIMIT.
SRC_COMMITS=$(git rev-list --no-merges --max-count="$SRC_SCAN_LIMIT" "$SRC_BRANCH")
SRC_COMMITS_ARRAY=()
while IFS= read -r line; do
  # Skip blank lines (e.g. the single empty line produced by empty output).
  if [[ -z "$line" ]]; then
    continue
  fi
  SRC_COMMITS_ARRAY[${#SRC_COMMITS_ARRAY[@]}]="$line"
done <<< "$SRC_COMMITS"
printf ' → Found %s source commits to scan.\n\n' "${#SRC_COMMITS_ARRAY[@]}"
122+
123+
echo "⚙️ Indexing source commit metadata..."
echo " → Processing ${#SRC_COMMITS_ARRAY[@]} commits from $SRC_BRANCH..."
# Parallel arrays: entry k of each array describes the same source commit.
SRC_COMMIT_META=()
SRC_PATCH_HASHES=()
SRC_PATCHES=()
indexed_commits=()

progress=0
src_total=${#SRC_COMMITS_ARRAY[@]}
for commit in "${SRC_COMMITS_ARRAY[@]}"; do
  progress=$((progress + 1))
  printf '\r [%s/%s] Indexing %s' "$progress" "$src_total" "$commit"

  # One git call yields the "<subject>__<author>__<author date>" key.
  meta_key=$(git log -1 --pretty=format:"%s__%an <%ae>__%ai" "$commit" 2>/dev/null) || continue

  # Normalized patch: index lines removed, leading whitespace trimmed.
  patch=$(git show --format= --unified=3 "$commit" | normalize_patch | sed 's/^[[:space:]]*//')
  patch_hash=$(printf '%s\n' "$patch" | hash_patch)

  indexed_commits+=("$commit")
  SRC_COMMIT_META+=("$meta_key")
  SRC_PATCH_HASHES+=("$patch_hash")
  SRC_PATCHES+=("$patch")
done

# Commits skipped above must not stay in SRC_COMMITS_ARRAY, otherwise its
# indices would no longer line up with the metadata/hash/patch arrays and
# later lookups by index would report the wrong source commit.
SRC_COMMITS_ARRAY=("${indexed_commits[@]}")

printf '\n → Completed source indexing.\n'
146+
147+
# Compare every release commit against the indexed source commits.
# A release commit counts as MATCHED when its normalized patch hash equals
# that of a source commit. Otherwise it counts as UNMATCHED and, if source
# commits with the same subject + author + author date exist, the one whose
# patch differs in the fewest lines is reported as the closest fuzzy match.
TOTAL=${#RELEASE_COMMITS_ARRAY[@]}
MATCHED=0
UNMATCHED=0

for i in "${!RELEASE_COMMITS_ARRAY[@]}"; do
  rc_commit="${RELEASE_COMMITS_ARRAY[$i]}"
  rc_author=$(git log -1 --pretty=format:"%an <%ae>" "$rc_commit" 2>/dev/null) || continue
  rc_subject=$(git log -1 --pretty=format:"%s" "$rc_commit" 2>/dev/null) || continue
  rc_authordate=$(git log -1 --pretty=format:"%ai" "$rc_commit" 2>/dev/null) || continue
  meta_key="${rc_subject}__${rc_author}__${rc_authordate}"

  printf '[%s/%s] Checking %s... ' "$((i + 1))" "$TOTAL" "${rc_commit:0:7}"

  rc_patch=$(git show --format= --unified=3 "$rc_commit" | normalize_patch | sed 's/^[[:space:]]*//')
  rc_patch_hash=$(printf '%s\n' "$rc_patch" | hash_patch)

  # Exact match: first source commit with an identical patch hash. An empty
  # hash (hashing failed) must never be treated as a match.
  found_exact_index=-1
  if [[ -n "$rc_patch_hash" ]]; then
    for j in "${!SRC_PATCH_HASHES[@]}"; do
      if [[ "${SRC_PATCH_HASHES[$j]}" == "$rc_patch_hash" ]]; then
        found_exact_index=$j
        break
      fi
    done
  fi

  if [[ $found_exact_index -ne -1 ]]; then
    found_exact="${SRC_COMMITS_ARRAY[$found_exact_index]}"
    # Ask git for the source metadata instead of splitting the stored
    # "subject__author__date" key, which breaks when a field contains "__".
    src_author=$(git log -1 --pretty=format:"%an <%ae>" "$found_exact")
    src_subject=$(git log -1 --pretty=format:"%s" "$found_exact")
    src_authordate=$(git log -1 --pretty=format:"%ai" "$found_exact")
    echo "✅ MATCHES ${found_exact:0:7}"
    echo " ↪ RELEASE: $rc_commit"
    echo " Author : $rc_author"
    echo " Date : $rc_authordate"
    echo " Subject: \"$rc_subject\""
    echo " ↪ SOURCE : $found_exact"
    echo " Author : $src_author"
    echo " Date : $src_authordate"
    echo " Subject: \"$src_subject\""
    echo ""
    MATCHED=$((MATCHED + 1))
    continue
  fi

  echo "❌ NO MATCH"
  UNMATCHED=$((UNMATCHED + 1))

  echo "🔍 Unmatched Commit:"
  echo " ↪ Commit : $rc_commit"
  echo " ↪ Author : $rc_author"
  echo " ↪ Subject: \"$rc_subject\""
  echo ""

  best_score=99999
  best_index=""
  fuzzy_candidates=0

  # Fuzzy candidates: source commits with identical subject, author and date.
  for j in "${!SRC_COMMIT_META[@]}"; do
    if [[ "${SRC_COMMIT_META[$j]}" == "$meta_key" ]]; then
      fuzzy_candidates=$((fuzzy_candidates + 1))
      # diff exits non-zero when the inputs differ; that is expected here.
      patch_delta=$(diff -u <(printf '%s\n' "$rc_patch") <(printf '%s\n' "${SRC_PATCHES[$j]}") || true)
      # Score = number of added/removed lines, ignoring the ---/+++ headers.
      score=$(printf '%s\n' "$patch_delta" | grep -vE '^(--- |\+\+\+ )' | grep -c '^[-+]')
      # Always accept the first candidate so best_index is never left empty,
      # even when its score is not below the initial sentinel.
      if [[ -z "$best_index" || "$score" -lt "$best_score" ]]; then
        best_score=$score
        best_index=$j
      fi
    fi
  done

  if [[ "$fuzzy_candidates" -eq 0 ]]; then
    echo "⚠️ No commits with matching author + subject + date in source branch."
  else
    match_commit="${SRC_COMMITS_ARRAY[$best_index]}"
    match_author=$(git log -1 --pretty=format:"%an <%ae>" "$match_commit")
    match_subject=$(git log -1 --pretty=format:"%s" "$match_commit")

    # Collect the changed paths NUL-delimited into an array so that paths with
    # spaces or other special characters reach git diff intact.
    changed_files=()
    while IFS= read -r -d '' changed_path; do
      changed_files+=("$changed_path")
    done < <(git show --pretty="" --name-only -z "$rc_commit")

    echo "🤔 Closest fuzzy match: $match_commit ($best_score changed lines from $fuzzy_candidates candidates)"
    echo " ↪ Author : $match_author"
    echo " ↪ Subject: \"$match_subject\""
    echo " ↪ Files Changed:"
    printf ' - %s\n' "${changed_files[@]}"
    echo ""

    echo "🔧 Check it manually (patch diff):"
    echo " git diff $match_commit $rc_commit -- \$(git show --pretty=\"\" --name-only $rc_commit)"
    echo ""

    echo "🔍 Diff between release and closest match:"
    echo "---------------------------------------------"
    git diff "$match_commit" "$rc_commit" -- "${changed_files[@]}" | sed 's/^/ /' || true
    echo "---------------------------------------------"
    echo ""
  fi

done
245+
246+
# Summary: final tallies of matched, unmatched and total release commits.
printf '\n🔎 Summary:\n ✅ Matched : %s\n ❌ Unmatched : %s\n 📦 Total : %s\n' \
  "$MATCHED" "$UNMATCHED" "$TOTAL"
252+

0 commit comments

Comments
 (0)