batmanlab.github.io/update_publication_links.py at main · batmanlab/batmanlab.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/env python3
"""
Script to update publication links in index.html based on publication_link.json
"""

import json
import re
from pathlib import Path

# Input locations: the JSON file with the correct links and the HTML page
# that this script rewrites in place.
json_path = Path("/Users/kayhan/Downloads/publication_link.json")
html_path = Path("/Users/kayhan/Documents/Projects/newWebSite/index.html")

# Parsed link data; find_publication_in_json() indexes it by year string.
publications_data = json.loads(json_path.read_text(encoding='utf-8'))

# Entire page held in memory; all later edits operate on this string.
html_content = html_path.read_text(encoding='utf-8')

def normalize_title(title):
    """Return a canonical form of *title* used for title matching.

    The title is HTML-unescaped, stripped, has every run of whitespace
    collapsed to a single space, and is lower-cased.

    Args:
        title: Title text, possibly containing HTML character references.

    Returns:
        The normalized, lower-case title string.
    """
    # Local import so the function does not depend on the file's import block.
    import html

    # Decode all character references in ONE pass. Chained str.replace()
    # calls decode "&amp;lt;" twice (-> "<") and only know three entities;
    # html.unescape handles named and numeric references exactly once.
    decoded = html.unescape(title)

    # Collapse whitespace runs (including newlines and non-breaking spaces)
    # so line-wrapped titles compare equal to single-line ones.
    collapsed = re.sub(r'\s+', ' ', decoded.strip())
    return collapsed.lower()

def find_publication_in_json(title, year):
    """Look up the JSON entry for *title* among the entries listed for *year*.

    An entry whose normalized title equals the normalized *title* wins
    outright. Otherwise, provided the title has at least three distinct
    words, the entry with the highest word-overlap ratio is returned, as
    long as that ratio is strictly above 0.6.

    Args:
        title: Publication title to search for.
        year: Publication year; converted with ``str()`` to form the key.

    Returns:
        The matching entry from ``publications_data``, or ``None``.
    """
    key = str(year)
    if key not in publications_data:
        return None

    entries = publications_data[key]
    wanted = normalize_title(title)

    # Pass 1: exact match on the normalized title.
    exact = next(
        (entry for entry in entries if normalize_title(entry['title']) == wanted),
        None,
    )
    if exact is not None:
        return exact

    # Pass 2: word-set overlap. Very short titles are skipped entirely.
    wanted_words = set(wanted.split())
    if len(wanted_words) < 3:
        return None

    winner = None
    top_ratio = 0
    for entry in entries:
        entry_words = set(normalize_title(entry['title']).split())
        if not entry_words:
            continue

        # Shared words divided by the size of the smaller word set.
        shared = len(wanted_words & entry_words)
        ratio = shared / min(len(wanted_words), len(entry_words))

        # Must beat both the 60% floor and the best candidate so far.
        if ratio > 0.6 and ratio > top_ratio:
            top_ratio = ratio
            winner = entry

    return winner

def build_meta_links_html(paper_link, preprint_link, code_link, project_link):
    """Build the ``meta-links`` ``<div>`` for one publication.

    Each non-blank link becomes an ``<a>`` element with its icon and label,
    in the fixed order Paper, Preprint, Code, Project. Anchors are joined
    with single spaces.

    Args:
        paper_link: URL of the published paper, or a blank/None value.
        preprint_link: URL of the preprint, or a blank/None value.
        code_link: URL of the code repository, or a blank/None value.
        project_link: URL of the project page, or a blank/None value.

    Returns:
        The ``<div class="meta-links mb-2">`` markup, or ``''`` when every
        link is missing or blank.
    """
    # Local import so the function does not depend on the file's import block.
    from html import escape

    link_specs = (
        (paper_link, 'bi bi-file-earmark-text', 'Paper'),
        (preprint_link, 'bi bi-cloud-download', 'Preprint'),
        (code_link, 'bi bi-github', 'Code'),
        (project_link, 'bi bi-box-arrow-up-right', 'Project'),
    )

    anchors = []
    for url, icon_class, label in link_specs:
        # Emit the stripped URL: previously the blank check used .strip()
        # but the unstripped value was written into the href.
        url = url.strip() if url else ''
        if not url:
            continue
        # Escape for an attribute context so '&' or '"' inside a URL cannot
        # produce invalid markup or terminate the attribute early.
        href = escape(url, quote=True)
        anchors.append(f'<a href="{href}"><i class="{icon_class}"></i> {label}</a>')

    if not anchors:
        return ''
    return f'<div class="meta-links mb-2">{" ".join(anchors)}</div>'

# First, remove all duplicate meta-links divs
# Find all meta-links and keep only the first one in each article
def remove_duplicate_meta_links(content):
    """Strip every meta-links div after the first from each article.

    Only text inside ``<article class="item-row">…</article>`` is touched.
    When a surplus div is dropped, the spaces, tabs and newlines directly
    in front of it are dropped as well.

    Args:
        content: Full HTML text.

    Returns:
        The HTML text with at most one meta-links div per article.
    """
    article_re = re.compile(r'(<article class="item-row">.*?</article>)', re.DOTALL)
    meta_div_re = re.compile(r'(<div class="meta-links mb-2">.*?</div>)', re.DOTALL)

    def _dedupe(article_match):
        text = article_match.group(1)
        spans = [found.span() for found in meta_div_re.finditer(text)]

        # Zero or one div: nothing to remove.
        if len(spans) < 2:
            return text

        # Cut surplus divs back-to-front so earlier offsets stay valid.
        for cut_from, cut_to in reversed(spans[1:]):
            # Extend the cut backwards over the preceding indentation/newlines.
            cut_from = len(text[:cut_from].rstrip(' \n\t'))
            text = text[:cut_from] + text[cut_to:]

        return text

    return article_re.sub(_dedupe, content)

# Remove duplicates first
html_content = remove_duplicate_meta_links(html_content)

# Now process each article to update links.
# Patterns are compiled once instead of once per year section.
_YEAR_HEADING_RE = re.compile(r'<h4 class="mt-4 mb-3 fw-bold">(\d{4})</h4>')
_ARTICLE_RE = re.compile(r'<article class="item-row">.*?</article>', re.DOTALL)
_TITLE_RE = re.compile(r'<h5 class="mb-1 fw-bold">(.*?)</h5>', re.DOTALL)
_VENUE_RE = re.compile(
    r'<div class="mb-1"><span class="fw-semibold">Venue:</span>.*?</div>',
    re.DOTALL,
)
# The leading \s* removes the indentation/newline in front of an old
# meta-links div together with the div, so re-running the script does not
# leave one more whitespace-only line behind each time.
_META_LINKS_RE = re.compile(r'\s*<div class="meta-links mb-2">.*?</div>', re.DOTALL)
_TAG_RE = re.compile(r'<[^>]+>')
# Newline + indentation placed in front of a freshly inserted meta-links div.
_META_INDENT = '\n                    '


def _update_article(article, year):
    """Return *article* with its meta-links replaced from the JSON data.

    *article* is the text of exactly one ``<article class="item-row">``
    element. It is returned unchanged when it has no title heading, no
    Venue div after the title, no matching JSON entry, or when the matching
    entry has no non-blank links.
    """
    title_match = _TITLE_RE.search(article)
    if title_match is None:
        return article

    # The new links go right after the Venue div. Searching inside a single
    # article means a missing Venue div can no longer make the match run on
    # into the following article.
    venue_match = _VENUE_RE.search(article, title_match.end())
    if venue_match is None:
        return article

    # Drop inline markup from the heading before matching on the title text.
    title_clean = _TAG_RE.sub('', title_match.group(1)).strip()

    pub_data = find_publication_in_json(title_clean, year)
    if not pub_data:
        return article

    new_meta = build_meta_links_html(
        pub_data.get('paper_link', ''),
        pub_data.get('preprint_link', ''),
        pub_data.get('code_link', ''),
        pub_data.get('project_link', ''),
    )
    if not new_meta:
        return article

    split_at = venue_match.end()
    # Remove any existing meta-links that follow the Venue div, then insert
    # the freshly built ones directly after it.
    remainder = _META_LINKS_RE.sub('', article[split_at:])
    return article[:split_at] + _META_INDENT + new_meta + remainder


def _update_section(section_html, year):
    """Apply _update_article to every article inside one year's section."""
    return _ARTICLE_RE.sub(
        lambda found: _update_article(found.group(0), year),
        section_html,
    )


# Find all year sections
year_sections = list(_YEAR_HEADING_RE.finditer(html_content))

if not year_sections:
    print("No year sections found!")
    # exit() is an interactive helper installed by the site module;
    # raising SystemExit works in every run mode.
    raise SystemExit(1)

# Rebuild the page front to back from slices of the original string, so no
# offset ever has to be adjusted after an edit.
pieces = []
cursor = 0
for index, year_match in enumerate(year_sections):
    year_start = year_match.start()

    # A section runs up to the next year heading; the last one runs up to
    # the first </section> after its heading (or to end of file).
    if index + 1 < len(year_sections):
        section_end = year_sections[index + 1].start()
    else:
        closing = html_content.find('</section>', year_start)
        section_end = closing if closing != -1 else len(html_content)

    pieces.append(html_content[cursor:year_start])
    pieces.append(
        _update_section(html_content[year_start:section_end], year_match.group(1))
    )
    cursor = section_end

pieces.append(html_content[cursor:])
result_content = ''.join(pieces)

# Write back to file
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(result_content)

print(f"Updated publication links in {html_path}")
print("Please review the changes before committing.")