|
1 |
def postprocess(extracted_texts: list[str], page_labels: list[str]) -> str:
    """Clean up the extracted text of every page and join it into one string.

    Args:
        extracted_texts: The extracted text of all pages, one entry per page.
        page_labels: One label per page, passed through to ``remove_footer``
            together with the page texts (presumably the printed page
            numbers, in the same order as ``extracted_texts`` -- confirm
            against the caller).

    Returns:
        The processed page texts joined with a single newline between pages.
    """
    # Each step runs over all pages before the next one starts.
    extracted_texts = [replace_ligatures(t) for t in extracted_texts]
    extracted_texts = [remove_hyphens(t) for t in extracted_texts]
    # Runs after the per-page clean-ups so it sees the already-normalised text.
    extracted_texts = remove_footer(extracted_texts, page_labels)
    return "\n".join(extracted_texts)
|
10 | 7 |
|
11 | 8 |
|
@@ -52,32 +49,20 @@ def dehyphenate(lines: list[str], line_no: int) -> list[str]:
|
52 | 49 | return lines
|
53 | 50 |
|
54 | 51 |
|
55 |
def remove_footer(extracted_texts: list[str], page_labels: list[str]) -> list[str]:
    """Strip each page's own label from the start and the end of its text.

    Pages and labels are paired by position. A label is only removed where
    it is the first or last non-whitespace content of the page text; a page
    on which the label matches at neither end is returned unchanged.

    Args:
        extracted_texts: The extracted text of all pages, one entry per page.
        page_labels: The label of each page, in the same order. Surplus
            labels are ignored; pages without a label are kept as they are.

    Returns:
        A new list with exactly one entry per input page.
    """
    # zip() stops at the shorter input, which would silently drop trailing
    # pages when fewer labels than pages are supplied. Pad with empty labels
    # instead (a negative count yields no padding).
    missing = len(extracted_texts) - len(page_labels)
    padded_labels = list(page_labels) + [""] * missing
    return [
        _strip_page_label(text, label)
        for text, label in zip(extracted_texts, padded_labels)
    ]


def _strip_page_label(text: str, label: str) -> str:
    """Remove ``label`` from the leading and trailing edge of one page text."""
    # An empty label "matches" everywhere, and ``text[:-0]`` is the empty
    # string, so without this guard the whole page would be erased.
    if not label:
        return text

    # Whitespace is only consumed on a side where the label actually matched.
    text_left = text.lstrip()
    if text_left.startswith(label):
        text = text_left[len(label) :]

    text_right = text.rstrip()
    if text_right.endswith(label):
        text = text_right[: -len(label)]

    return text
0 commit comments