From 086c1880ac600e8d4b043fc8206298e9e964081d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 15 Oct 2025 12:24:21 +0200 Subject: [PATCH 01/21] binarization: add option `--overwrite`, skip existing outputs (also, simplify `run` and separate `run_single`) --- src/eynollah/cli.py | 16 +++++- src/eynollah/sbb_binarize.py | 104 +++++++++++++++-------------------- 2 files changed, 56 insertions(+), 64 deletions(-) diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py index c9bad52..e4a24e4 100644 --- a/src/eynollah/cli.py +++ b/src/eynollah/cli.py @@ -79,18 +79,28 @@ def machine_based_reading_order(input, dir_in, out, model, log_level): type=click.Path(file_okay=True, dir_okay=True), required=True, ) +@click.option( + "--overwrite", + "-O", + help="overwrite (instead of skipping) if output xml exists", + is_flag=True, +) @click.option( "--log_level", "-l", type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']), help="Override log level globally to this", ) -def binarization(patches, model_dir, input_image, dir_in, output, log_level): +def binarization(patches, model_dir, input_image, dir_in, output, overwrite, log_level): assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both." binarizer = SbbBinarizer(model_dir) if log_level: - binarizer.log.setLevel(getLevelName(log_level)) - binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in) + binarizer.logger.setLevel(getLevelName(log_level)) + binarizer.run(overwrite=overwrite, + use_patches=patches, + image_path=input_image, + output=output, + dir_in=dir_in) @main.command() diff --git a/src/eynollah/sbb_binarize.py b/src/eynollah/sbb_binarize.py index 3716987..0eab2ae 100644 --- a/src/eynollah/sbb_binarize.py +++ b/src/eynollah/sbb_binarize.py @@ -25,7 +25,7 @@ class SbbBinarizer: def __init__(self, model_dir, logger=None): self.model_dir = model_dir - self.log = logger if logger else logging.getLogger('SbbBinarizer') + self.logger = logger if logger else logging.getLogger('SbbBinarizer') self.start_new_session() @@ -315,64 +315,46 @@ def predict(self, model_in, img, use_patches, n_batch_inference=5): prediction_true = prediction_true.astype(np.uint8) return prediction_true[:,:,0] - def run(self, image=None, image_path=None, output=None, use_patches=False, dir_in=None): - # print(dir_in,'dir_in') - if not dir_in: - if (image is not None and image_path is not None) or \ - (image is None and image_path is None): - raise ValueError("Must pass either a opencv2 image or an image_path") - if image_path is not None: - image = cv2.imread(image_path) - img_last = 0 - for n, (model, model_file) in enumerate(zip(self.models, self.model_files)): - self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files))) - - res = self.predict(model, image, use_patches) - - img_fin = np.zeros((res.shape[0], res.shape[1], 3)) - res[:, :][res[:, :] == 0] = 2 - res = res - 1 - res = res * 255 - img_fin[:, :, 0] = res - img_fin[:, :, 1] = res - img_fin[:, :, 2] = res - - img_fin = img_fin.astype(np.uint8) - img_fin = (res[:, :] == 0) * 255 - img_last = img_last + img_fin - - kernel = np.ones((5, 5), np.uint8) - img_last[:, :][img_last[:, :] > 0] = 255 - img_last = (img_last[:, :] == 0) * 255 - if output: - cv2.imwrite(output, img_last) - return img_last + def run(self, image_path=None, output=None, dir_in=None, use_patches=False, overwrite=False): + if dir_in: + ls_imgs = [(os.path.join(dir_in, image_filename), + os.path.join(output, 
os.path.splitext(image_filename)[0] + '.png'))
+                       for image_filename in filter(is_image_filename,
+                                                    os.listdir(dir_in))]
         else:
-            ls_imgs = list(filter(is_image_filename, os.listdir(dir_in)))
-            for image_name in ls_imgs:
-                image_stem = image_name.split('.')[0]
-                print(image_name,'image_name')
-                image = cv2.imread(os.path.join(dir_in,image_name) )
-                img_last = 0
-                for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
-                    self.log.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))
-
-                    res = self.predict(model, image, use_patches)
-
-                    img_fin = np.zeros((res.shape[0], res.shape[1], 3))
-                    res[:, :][res[:, :] == 0] = 2
-                    res = res - 1
-                    res = res * 255
-                    img_fin[:, :, 0] = res
-                    img_fin[:, :, 1] = res
-                    img_fin[:, :, 2] = res
-
-                    img_fin = img_fin.astype(np.uint8)
-                    img_fin = (res[:, :] == 0) * 255
-                    img_last = img_last + img_fin
-
-                kernel = np.ones((5, 5), np.uint8)
-                img_last[:, :][img_last[:, :] > 0] = 255
-                img_last = (img_last[:, :] == 0) * 255
-
-                cv2.imwrite(os.path.join(output, image_stem + '.png'), img_last)
+            ls_imgs = [(image_path, output)]
+
+        for input_path, output_path in ls_imgs:
+            print(input_path, 'image_name')
+            if os.path.exists(output_path):
+                if overwrite:
+                    self.logger.warning("will overwrite existing output file '%s'", output_path)
+                else:
+                    self.logger.warning("will skip input for existing output file '%s'", output_path)
+            image = cv2.imread(input_path)
+            result = self.run_single(image, use_patches)
+            cv2.imwrite(output_path, result)
+
+    def run_single(self, image: np.ndarray, use_patches=False):
+        img_last = 0
+        for n, (model, model_file) in enumerate(zip(self.models, self.model_files)):
+            self.logger.info('Predicting with model %s [%s/%s]' % (model_file, n + 1, len(self.model_files)))
+
+            res = self.predict(model, image, use_patches)
+
+            img_fin = np.zeros((res.shape[0], res.shape[1], 3))
+            res[:, :][res[:, :] == 0] = 2
+            res = res - 1
+            res = res * 255
+            img_fin[:, :, 0] = res
+            img_fin[:, :, 1] = res
+            img_fin[:, :, 2] = res
+
+            img_fin = img_fin.astype(np.uint8)
+            img_fin = (res[:, :] == 0) * 255
+            img_last = img_last + img_fin
+
+        kernel = np.ones((5, 5), np.uint8)
+        img_last[:, :][img_last[:, :] > 0] = 255
+        img_last = (img_last[:, :] == 0) * 255
+        return img_last

From 184927fb5488f440948320ca97d716144da5012c Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Mon, 20 Oct 2025 13:16:57 +0200
Subject: [PATCH 02/21] `find_num_col`: re-sort peaks when cutting n-best
 `num_col_classifier`

---
 src/eynollah/utils/__init__.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index 5ccb2af..7c47407 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -463,22 +463,19 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl
         interest_neg_fin = interest_neg[(interest_neg < grenze)]
         peaks_neg_fin = peaks_neg[(interest_neg < grenze)]

-        # interest_neg_fin=interest_neg[(interest_neg= 3:
-            index_sort_interest_neg_fin= np.argsort(interest_neg_fin)
-            peaks_neg_sorted = np.array(peaks_neg)[index_sort_interest_neg_fin]
-            interest_neg_fin_sorted = np.array(interest_neg_fin)[index_sort_interest_neg_fin]
-
-            if len(index_sort_interest_neg_fin)>=num_col_classifier:
-                peaks_neg_fin = list( peaks_neg_sorted[:num_col_classifier] )
-                interest_neg_fin = list( interest_neg_fin_sorted[:num_col_classifier] )
-            else:
-                peaks_neg_fin = peaks_neg[:]
-                interest_neg_fin = 
interest_neg[:] - - num_col = (len(interest_neg_fin)) + 1 + # found too few columns here: ignore 'grenze' and take the deepest N peaks + sort_by_height = np.argsort(interest_neg)[:num_col_classifier] + peaks_neg_fin = peaks_neg[sort_by_height] + interest_neg_fin = interest_neg[sort_by_height] + # print(peaks_neg_fin, "peaks_neg[sorted_by_height]") + sort_by_pos = np.argsort(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin[sort_by_pos] + interest_neg_fin = interest_neg_fin[sort_by_pos] + + num_col = len(interest_neg_fin) + 1 # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') From 48761c3e127bfde488cc3ff6dd7edc97eb85bfd0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:20:12 +0200 Subject: [PATCH 03/21] `find_num_col`: simplify, add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 212 +++++++++++++++++---------------- 1 file changed, 110 insertions(+), 102 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 7c47407..ce72df4 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -396,16 +396,18 @@ def find_num_col_deskew(regions_without_separators, sigma_, multiplier=3.8): def find_num_col(regions_without_separators, num_col_classifier, tables, multiplier=3.8): if not regions_without_separators.any(): return 0, [] - #plt.imshow(regions_without_separators) - #plt.show() regions_without_separators_0 = regions_without_separators.sum(axis=0) - ##plt.plot(regions_without_separators_0) - ##plt.show() + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(regions_without_separators_0) + # plt.show() sigma_ = 35 # 70#35 - meda_n_updown = regions_without_separators_0[len(regions_without_separators_0) :: -1] + meda_n_updown = regions_without_separators_0[::-1] first_nonzero = next((i for i, x in enumerate(regions_without_separators_0) if x), 0) last_nonzero = next((i for i, x in enumerate(meda_n_updown) if x), 0) last_nonzero = len(regions_without_separators_0) - last_nonzero + last_nonzero = last_nonzero - 100 + first_nonzero = first_nonzero + 200 y = regions_without_separators_0 # [first_nonzero:last_nonzero] y_help = np.zeros(len(y) + 20) y_help[10 : len(y) + 10] = y @@ -416,28 +418,44 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl z = gaussian_filter1d(y, sigma_) zneg = gaussian_filter1d(zneg, sigma_) - peaks_neg, _ = find_peaks(zneg, height=0) - #plt.plot(zneg) - #plt.plot(peaks_neg, zneg[peaks_neg], 'rx') - #plt.show() peaks, _ = find_peaks(z, height=0) + peaks_neg, _ = find_peaks(zneg, height=0) + # _, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.set_title("z") + # ax1.plot(z) + # ax1.scatter(peaks, z[peaks]) + # ax1.axvline(0.06 * len(y), label="first") + # ax1.axvline(0.94 * len(y), label="last") + # ax1.text(0.06 * len(y), 0, "first", rotation=90) + # ax1.text(0.94 * len(y), 0, "last", rotation=90) + # ax1.axhline(10, label="minimum") + # ax1.text(0, 10, "minimum") + # ax2.set_title("zneg") + # ax2.plot(zneg) + # ax2.scatter(peaks_neg, zneg[peaks_neg]) + # ax2.axvline(first_nonzero, label="first nonzero") + # ax2.axvline(last_nonzero, label="last nonzero") + # ax2.text(first_nonzero, 0, "first nonzero", rotation=90) + # ax2.text(last_nonzero, 0, "last nonzero", rotation=90) + # ax2.axvline(370, label="first") + # ax2.axvline(len(y) - 370, label="last") + # ax2.text(370, 0, "first", rotation=90) + # ax2.text(len(y) - 370, 0, "last", rotation=90) + # 
plt.show() peaks_neg = peaks_neg - 10 - 10 - last_nonzero = last_nonzero - 100 - first_nonzero = first_nonzero + 200 - - peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & - (peaks_neg < last_nonzero)] - peaks = peaks[(peaks > 0.06 * regions_without_separators.shape[1]) & - (peaks < 0.94 * regions_without_separators.shape[1])] - peaks_neg = peaks_neg[(peaks_neg > 370) & - (peaks_neg < (regions_without_separators.shape[1] - 370))] + peaks = peaks[(peaks > 0.06 * len(y)) & + (peaks < 0.94 * len(y))] interest_pos = z[peaks] interest_pos = interest_pos[interest_pos > 10] if not interest_pos.any(): return 0, [] # plt.plot(z) # plt.show() + peaks_neg = peaks_neg[(peaks_neg > first_nonzero) & + (peaks_neg < last_nonzero)] + peaks_neg = peaks_neg[(peaks_neg > 370) & + (peaks_neg < len(y) - 370)] interest_neg = z[peaks_neg] if not interest_neg.any(): return 0, [] @@ -445,21 +463,28 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl min_peaks_pos = np.min(interest_pos) max_peaks_pos = np.max(interest_pos) - if max_peaks_pos / min_peaks_pos >= 35: + #print(min_peaks_pos, max_peaks_pos, max_peaks_pos / min_peaks_pos, 'minmax') + if max_peaks_pos / (min_peaks_pos or 1e-9) >= 35: min_peaks_pos = np.mean(interest_pos) min_peaks_neg = 0 # np.min(interest_neg) - # print(np.min(interest_pos),np.max(interest_pos),np.max(interest_pos)/np.min(interest_pos),'minmax') dis_talaei = (min_peaks_pos - min_peaks_neg) / multiplier grenze = min_peaks_pos - dis_talaei - # np.mean(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])-np.std(y[peaks_neg[0]:peaks_neg[len(peaks_neg)-1]])/2.0 + #np.mean(y[peaks_neg[0]:peaks_neg[-1]])-np.std(y[peaks_neg[0]:peaks_neg[-1]])/2.0 # print(interest_neg,'interest_neg') # print(grenze,'grenze') # print(min_peaks_pos,'min_peaks_pos') # print(dis_talaei,'dis_talaei') # print(peaks_neg,'peaks_neg') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True) + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks_neg, z[peaks_neg]) + # ax2.axhline(grenze, label="grenze") + # ax2.text(0, grenze, "grenze") + # plt.show() interest_neg_fin = interest_neg[(interest_neg < grenze)] peaks_neg_fin = peaks_neg[(interest_neg < grenze)] @@ -479,46 +504,38 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg_fin,'peaks_neg_fin') # print(num_col,'diz') - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - if num_col == 3: - if ((peaks_neg_fin[0] > p_g_u and - peaks_neg_fin[1] > p_g_u) or - (peaks_neg_fin[0] < p_g_l and - peaks_neg_fin[1] < p_g_l) or - (peaks_neg_fin[0] + 200 < p_m and - peaks_neg_fin[1] < p_m) or - (peaks_neg_fin[0] - 200 > p_m and - peaks_neg_fin[1] > p_m)): - num_col = 1 - peaks_neg_fin = [] - - if num_col == 2: - if (peaks_neg_fin[0] > p_g_u or - peaks_neg_fin[0] < p_g_l): - num_col = 1 - peaks_neg_fin = [] + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_fin[0] > 0.75 * len(y) and + peaks_neg_fin[1] > 0.75 * len(y)) or + (peaks_neg_fin[0] < 0.25 * len(y) and + peaks_neg_fin[1] < 0.25 * len(y)) or + (peaks_neg_fin[0] < 0.5 * len(y) - 200 and + peaks_neg_fin[1] < 0.5 * len(y)) or + (peaks_neg_fin[0] > 0.5 * len(y) + 200 and + peaks_neg_fin[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_fin[0] > 0.75 * len(y) or + peaks_neg_fin[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_fin = [] ##print(len(peaks_neg_fin)) + # filter out peaks that 
are too close (<400px) to each other: + # among each group, pick the position with smallest amount of text diff_peaks = np.abs(np.diff(peaks_neg_fin)) cut_off = 400 peaks_neg_true = [] forest = [] - # print(len(peaks_neg_fin),'len_') - for i in range(len(peaks_neg_fin)): if i == 0: forest.append(peaks_neg_fin[i]) if i < len(peaks_neg_fin) - 1: if diff_peaks[i] <= cut_off: forest.append(peaks_neg_fin[i + 1]) - if diff_peaks[i] > cut_off: + else: # print(forest[np.argmin(z[forest]) ] ) if not isNaN(forest[np.argmin(z[forest])]): peaks_neg_true.append(forest[np.argmin(z[forest])]) @@ -530,68 +547,59 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl peaks_neg_true.append(forest[np.argmin(z[forest])]) num_col = len(peaks_neg_true) + 1 - p_l = 0 - p_u = len(y) - 1 - p_m = int(len(y) / 2.0) - p_quarter = int(len(y) / 5.0) - p_g_l = int(len(y) / 4.0) - p_g_u = len(y) - int(len(y) / 4.0) - - p_u_quarter = len(y) - p_quarter - + #print(peaks_neg_true, "peaks_neg_true") ##print(num_col,'early') - if num_col == 3: - if ((peaks_neg_true[0] > p_g_u and - peaks_neg_true[1] > p_g_u) or - (peaks_neg_true[0] < p_g_l and - peaks_neg_true[1] < p_g_l) or - (peaks_neg_true[0] < p_m and - peaks_neg_true[1] + 200 < p_m) or - (peaks_neg_true[0] - 200 > p_m and - peaks_neg_true[1] > p_m)): - num_col = 1 - peaks_neg_true = [] - elif (peaks_neg_true[0] < p_g_u and - peaks_neg_true[0] > p_g_l and - peaks_neg_true[1] > p_u_quarter): - peaks_neg_true = [peaks_neg_true[0]] - elif (peaks_neg_true[1] < p_g_u and - peaks_neg_true[1] > p_g_l and - peaks_neg_true[0] < p_quarter): - peaks_neg_true = [peaks_neg_true[1]] - - if num_col == 2: - if (peaks_neg_true[0] > p_g_u or - peaks_neg_true[0] < p_g_l): - num_col = 1 - peaks_neg_true = [] - - diff_peaks_abnormal = diff_peaks[diff_peaks < 360] - - if len(diff_peaks_abnormal) > 0: - arg_help = np.arange(len(diff_peaks)) - arg_help_ann = arg_help[diff_peaks < 360] - - peaks_neg_fin_new = [] - - for ii in range(len(peaks_neg_fin)): - if ii in arg_help_ann: - arg_min = np.argmin([interest_neg_fin[ii], interest_neg_fin[ii + 1]]) - if arg_min == 0: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) - - elif (ii - 1) not in arg_help_ann: - peaks_neg_fin_new.append(peaks_neg_fin[ii]) - else: - peaks_neg_fin_new = peaks_neg_fin + # cancel if resulting split is highly unbalanced across available width + if ((num_col == 3 and + ((peaks_neg_true[0] > 0.75 * len(y) and + peaks_neg_true[1] > 0.75 * len(y)) or + (peaks_neg_true[0] < 0.25 * len(y) and + peaks_neg_true[1] < 0.25 * len(y)) or + (peaks_neg_true[0] < 0.5 * len(y) - 200 and + peaks_neg_true[1] < 0.5 * len(y)) or + (peaks_neg_true[0] > 0.5 * len(y) + 200 and + peaks_neg_true[1] > 0.5 * len(y)))) or + (num_col == 2 and + (peaks_neg_true[0] > 0.75 * len(y) or + peaks_neg_true[0] < 0.25 * len(y)))): + num_col = 1 + peaks_neg_true = [] + if (num_col == 3 and + (peaks_neg_true[0] < 0.75 * len(y) and + peaks_neg_true[0] > 0.25 * len(y) and + peaks_neg_true[1] > 0.80 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[0]] + if (num_col == 3 and + (peaks_neg_true[1] < 0.75 * len(y) and + peaks_neg_true[1] > 0.25 * len(y) and + peaks_neg_true[0] < 0.20 * len(y))): + num_col = 2 + peaks_neg_true = [peaks_neg_true[1]] + + # get rid of too narrow columns (not used) + # if np.count_nonzero(diff_peaks < 360): + # arg_help = np.arange(len(diff_peaks)) + # arg_help_ann = arg_help[diff_peaks < 360] + # peaks_neg_fin_new = [] + # for ii in 
range(len(peaks_neg_fin)): + # if ii in arg_help_ann: + # if interest_neg_fin[ii] < interest_neg_fin[ii + 1]: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new.append(peaks_neg_fin[ii + 1]) + + # elif (ii - 1) not in arg_help_ann: + # peaks_neg_fin_new.append(peaks_neg_fin[ii]) + # else: + # peaks_neg_fin_new = peaks_neg_fin # plt.plot(gaussian_filter1d(y, sigma_)) # plt.plot(peaks_neg_true,z[peaks_neg_true],'*') # plt.plot([0,len(y)], [grenze,grenze]) # plt.show() ##print(len(peaks_neg_true)) + #print(peaks_neg_true, "peaks_neg_true") return len(peaks_neg_true), peaks_neg_true def find_num_col_only_image(regions_without_separators, multiplier=3.8): From c43a825d1d26c36beee3bbc2e038f8c0cda4221b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:26:01 +0200 Subject: [PATCH 04/21] `order_of_regions`: filter out-of-image peaks --- src/eynollah/utils/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index ce72df4..677ed53 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1216,15 +1216,16 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): peaks_neg, _ = find_peaks(zneg, height=0) peaks_neg = peaks_neg - 20 - 20 - ##plt.plot(z) - ##plt.show() - cx_main, cy_main = find_center_of_contours(contours_main) - cx_head, cy_head = find_center_of_contours(contours_head) - - peaks_neg_new = np.append(np.insert(peaks_neg, 0, 0), textline_mask.shape[0]) + peaks_neg_new = np.array([0] + + # peaks can be beyond box due to padding and smoothing + [peak for peak in peaks_neg + if 0 < peak and peak < textline_mask.shape[0]] + + [textline_mask.shape[0]]) # offset from bbox of mask peaks_neg_new += y_ref + cx_main, cy_main = find_center_of_contours(contours_main) + cx_head, cy_head = find_center_of_contours(contours_head) # assert not len(cy_main) or np.min(peaks_neg_new) <= np.min(cy_main) and np.max(cy_main) <= np.max(peaks_neg_new) # assert not len(cy_head) or np.min(peaks_neg_new) <= np.min(cy_head) and np.max(cy_head) <= np.max(peaks_neg_new) From d3d599b0108bf17802bda2f9808620e3cd8471db Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:27:23 +0200 Subject: [PATCH 05/21] `order_of_regions`: add better plotting (but commented out) --- src/eynollah/eynollah.py | 2 +- src/eynollah/utils/__init__.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 13acba6..9412861 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2553,7 +2553,7 @@ def match_boxes(only_centers: bool): con_inter_box_h = contours_only_text_parent_h[args_contours_box_head] indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions( - textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2]) + textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0]) order_of_texts, id_of_texts = order_and_id_of_texts( con_inter_box, con_inter_box_h, diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 677ed53..f2e3581 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1197,7 +1197,7 @@ def small_textlines_to_parent_adherence2(textlines_con, textline_iamge, num_col) textlines_con_changed.append(textlines_big_org_form) return textlines_con_changed -def order_of_regions(textline_mask, contours_main, 
contours_head, y_ref): +def order_of_regions(textline_mask, contours_main, contours_head, y_ref, x_ref): ##plt.imshow(textline_mask) ##plt.show() y = textline_mask.sum(axis=1) # horizontal projection profile @@ -1208,6 +1208,8 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): #z = gaussian_filter1d(y_padded, sigma_gaus) #peaks, _ = find_peaks(z, height=0) #peaks = peaks - 20 + ##plt.plot(z) + ##plt.show() zneg_rev = np.max(y_padded) - y_padded zneg = np.zeros(len(zneg_rev) + 40) zneg[20 : len(zneg_rev) + 20] = zneg_rev @@ -1250,6 +1252,22 @@ def order_of_regions(textline_mask, contours_main, contours_head, y_ref): indexes_in, types_in, cxs_in, cys_in, typed_indexes_in = \ matrix_of_orders[(matrix_of_orders[:, 3] >= top) & (matrix_of_orders[:, 3] < bot)].T + # if indexes_in.size: + # img = textline_mask.copy() + # plt.imshow(img) + # plt.gca().add_patch(patches.Rectangle((0, top-y_ref), img.shape[1], bot-top, alpha=0.5, color='gray')) + # xrange = np.arange(0, img.shape[1], 50) + # yrange = np.arange(0, img.shape[0], 50) + # plt.gca().set_xticks(xrange, xrange + x_ref) + # plt.gca().set_yticks(yrange, yrange + y_ref) + # for idx, type_, cx, cy in zip(typed_indexes_in, types_in, cxs_in, cys_in): + # cnt = (contours_main if type_ == 1 else contours_head)[idx] + # col = 'red' if type_ == 1 else 'blue' + # plt.scatter(cx - x_ref, cy - y_ref, 20, c=col, marker='o') + # plt.gca().add_patch(patches.Polygon(cnt[:, 0] - [[x_ref, y_ref]], closed=False, fill=False, color=col)) + # plt.title("box contours centered in %d:%d (red=main / blue=heading)" % (top, bot)) + # plt.show() + sorted_inside = np.argsort(cxs_in) final_indexers_sorted.extend(indexes_in[sorted_inside]) final_types.extend(types_in[sorted_inside]) From 542d38ab432e3089ebc8fefd3caee2915fe6b031 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:34:56 +0200 Subject: [PATCH 06/21] =?UTF-8?q?`find=5Fnumber=5Fof=5Fcolumns=5Fin=5Fdocu?= =?UTF-8?q?ment`:=20simplify,=20rename=20`line`=E2=86=92`seps`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/eynollah/utils/__init__.py | 260 +++++++++++++++------------------ 1 file changed, 117 insertions(+), 143 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f2e3581..168899f 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1377,175 +1377,149 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): peaks_neg_tot.append(last_point) return peaks_neg_tot -def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_lines, contours_h=None): - t_ins_c0 = time.time() - separators_closeup=( (region_pre_p[:,:]==label_lines))*1 - separators_closeup[0:110,:]=0 - separators_closeup[separators_closeup.shape[0]-150:,:]=0 +def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + separators_closeup = 1 * (region_pre_p == label_seps) + separators_closeup[0:110] = 0 + separators_closeup[-150:] = 0 kernel = np.ones((5,5),np.uint8) - separators_closeup=separators_closeup.astype(np.uint8) - separators_closeup = cv2.dilate(separators_closeup,kernel,iterations = 1) - separators_closeup = cv2.erode(separators_closeup,kernel,iterations = 1) - - separators_closeup_new=np.zeros((separators_closeup.shape[0] ,separators_closeup.shape[1] )) - separators_closeup_n=np.copy(separators_closeup) - separators_closeup_n=separators_closeup_n.astype(np.uint8) - - 
separators_closeup_n_binary=np.zeros(( separators_closeup_n.shape[0],separators_closeup_n.shape[1]) ) - separators_closeup_n_binary[:,:]=separators_closeup_n[:,:] - separators_closeup_n_binary[:,:][separators_closeup_n_binary[:,:]!=0]=1 - - _, thresh_e = cv2.threshold(separators_closeup_n_binary, 0, 255, 0) - contours_line_e, _ = cv2.findContours(thresh_e.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - _, dist_xe, _, _, _, _, y_min_main, y_max_main, _ = \ - find_features_of_lines(contours_line_e) - dist_ye = y_max_main - y_min_main - args_e=np.arange(len(contours_line_e)) - args_hor_e=args_e[(dist_ye<=50) & - (dist_xe>=3*dist_ye)] - cnts_hor_e=[] - for ce in args_hor_e: - cnts_hor_e.append(contours_line_e[ce]) - - separators_closeup_n_binary=cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) - gray = cv2.bitwise_not(separators_closeup_n_binary) - gray=gray.astype(np.uint8) - - bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, \ - cv2.THRESH_BINARY, 15, -2) - horizontal = np.copy(bw) - vertical = np.copy(bw) - - cols = horizontal.shape[1] - horizontal_size = cols // 30 - # Create structure element for extracting horizontal lines through morphology operations + separators_closeup = separators_closeup.astype(np.uint8) + separators_closeup = cv2.morphologyEx(separators_closeup, cv2.MORPH_CLOSE, kernel, iterations=1) + + separators_closeup_n = separators_closeup.astype(np.uint8) # to be returned + + separators_closeup_n_binary = separators_closeup_n.copy() + + # find horizontal lines by contour properties + contours_sep_e, _ = cv2.findContours(separators_closeup_n_binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cnts_hor_e = [] + for cnt in contours_sep_e: + max_xe = cnt[:, 0, 0].max() + min_xe = cnt[:, 0, 0].min() + max_ye = cnt[:, 0, 1].max() + min_ye = cnt[:, 0, 1].min() + dist_xe = max_xe - min_xe + dist_ye = max_ye - min_ye + if dist_ye <= 50 and dist_xe >= 3 * dist_ye: + cnts_hor_e.append(cnt) + + # delete horizontal contours (leaving only the edges) + separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) + edges = cv2.adaptiveThreshold(separators_closeup_n_binary * 255, 255, + cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2) + horizontal = np.copy(edges) + vertical = np.copy(edges) + + horizontal_size = horizontal.shape[1] // 30 + # find horizontal lines by morphology horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1)) - # Apply morphology operations - horizontal = cv2.erode(horizontal, horizontalStructure) - horizontal = cv2.dilate(horizontal, horizontalStructure) - - kernel = np.ones((5,5),np.uint8) - horizontal = cv2.dilate(horizontal,kernel,iterations = 2) - horizontal = cv2.erode(horizontal,kernel,iterations = 2) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_OPEN, horizontalStructure) + horizontal = cv2.morphologyEx(horizontal, cv2.MORPH_CLOSE, kernel, iterations=2) + # re-insert deleted horizontal contours horizontal = cv2.fillPoly(horizontal, pts=cnts_hor_e, color=255) - rows = vertical.shape[0] - verticalsize = rows // 30 - # Create structure element for extracting vertical lines through morphology operations - verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize)) - # Apply morphology operations - vertical = cv2.erode(vertical, verticalStructure) - vertical = cv2.dilate(vertical, verticalStructure) - vertical = cv2.dilate(vertical,kernel,iterations = 1) + vertical_size = vertical.shape[0] // 30 + # find vertical lines 
by morphology + verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size)) + vertical = cv2.morphologyEx(vertical, cv2.MORPH_OPEN, verticalStructure) + vertical = cv2.dilate(vertical, kernel, iterations=1) horizontal, special_separators = \ combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( vertical, horizontal, num_col_classifier) - separators_closeup_new[:,:][vertical[:,:]!=0]=1 - separators_closeup_new[:,:][horizontal[:,:]!=0]=1 - _, thresh = cv2.threshold(vertical, 0, 255, 0) - contours_line_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_vers) - - args=np.arange(len(slope_lines)) - args_ver=args[slope_lines==1] - dist_x_ver=dist_x[slope_lines==1] - y_min_main_ver=y_min_main[slope_lines==1] - y_max_main_ver=y_max_main[slope_lines==1] - x_min_main_ver=x_min_main[slope_lines==1] - x_max_main_ver=x_max_main[slope_lines==1] - cx_main_ver=cx_main[slope_lines==1] - dist_y_ver=y_max_main_ver-y_min_main_ver + contours_sep_vers, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_vers) + + args=np.arange(len(slope_seps)) + args_ver=args[slope_seps==1] + dist_x_ver=dist_x[slope_seps==1] + y_min_seps_ver=y_min_seps[slope_seps==1] + y_max_seps_ver=y_max_seps[slope_seps==1] + x_min_seps_ver=x_min_seps[slope_seps==1] + x_max_seps_ver=x_max_seps[slope_seps==1] + cx_seps_ver=cx_seps[slope_seps==1] + dist_y_ver=y_max_seps_ver-y_min_seps_ver len_y=separators_closeup.shape[0]/3.0 _, thresh = cv2.threshold(horizontal, 0, 255, 0) - contours_line_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - slope_lines, dist_x, x_min_main, x_max_main, cy_main, slope_lines_org, y_min_main, y_max_main, cx_main = \ - find_features_of_lines(contours_line_hors) + contours_sep_hors, _ = cv2.findContours(thresh.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + slope_seps, dist_x, x_min_seps, x_max_seps, cy_seps, slope_seps_org, y_min_seps, y_max_seps, cx_seps = \ + find_features_of_lines(contours_sep_hors) - slope_lines_org_hor=slope_lines_org[slope_lines==0] - args=np.arange(len(slope_lines)) + slope_seps_org_hor=slope_seps_org[slope_seps==0] + args=np.arange(len(slope_seps)) len_x=separators_closeup.shape[1]/5.0 - dist_y=np.abs(y_max_main-y_min_main) + dist_y=np.abs(y_max_seps-y_min_seps) - args_hor=args[slope_lines==0] - dist_x_hor=dist_x[slope_lines==0] - y_min_main_hor=y_min_main[slope_lines==0] - y_max_main_hor=y_max_main[slope_lines==0] - x_min_main_hor=x_min_main[slope_lines==0] - x_max_main_hor=x_max_main[slope_lines==0] - dist_y_hor=dist_y[slope_lines==0] - cy_main_hor=cy_main[slope_lines==0] + args_hor=args[slope_seps==0] + dist_x_hor=dist_x[slope_seps==0] + y_min_seps_hor=y_min_seps[slope_seps==0] + y_max_seps_hor=y_max_seps[slope_seps==0] + x_min_seps_hor=x_min_seps[slope_seps==0] + x_max_seps_hor=x_max_seps[slope_seps==0] + dist_y_hor=dist_y[slope_seps==0] + cy_seps_hor=cy_seps[slope_seps==0] args_hor=args_hor[dist_x_hor>=len_x/2.0] - x_max_main_hor=x_max_main_hor[dist_x_hor>=len_x/2.0] - x_min_main_hor=x_min_main_hor[dist_x_hor>=len_x/2.0] - cy_main_hor=cy_main_hor[dist_x_hor>=len_x/2.0] - y_min_main_hor=y_min_main_hor[dist_x_hor>=len_x/2.0] - 
y_max_main_hor=y_max_main_hor[dist_x_hor>=len_x/2.0] + x_max_seps_hor=x_max_seps_hor[dist_x_hor>=len_x/2.0] + x_min_seps_hor=x_min_seps_hor[dist_x_hor>=len_x/2.0] + cy_seps_hor=cy_seps_hor[dist_x_hor>=len_x/2.0] + y_min_seps_hor=y_min_seps_hor[dist_x_hor>=len_x/2.0] + y_max_seps_hor=y_max_seps_hor[dist_x_hor>=len_x/2.0] dist_y_hor=dist_y_hor[dist_x_hor>=len_x/2.0] - slope_lines_org_hor=slope_lines_org_hor[dist_x_hor>=len_x/2.0] + slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_lines_ch=np.zeros((len(cy_main_hor)+len(cx_main_ver),10)) - matrix_of_lines_ch[:len(cy_main_hor),0]=args_hor - matrix_of_lines_ch[len(cy_main_hor):,0]=args_ver - matrix_of_lines_ch[len(cy_main_hor):,1]=cx_main_ver - matrix_of_lines_ch[:len(cy_main_hor),2]=x_min_main_hor+50#x_min_main_hor+150 - matrix_of_lines_ch[len(cy_main_hor):,2]=x_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),3]=x_max_main_hor-50#x_max_main_hor-150 - matrix_of_lines_ch[len(cy_main_hor):,3]=x_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),4]=dist_x_hor - matrix_of_lines_ch[len(cy_main_hor):,4]=dist_x_ver - matrix_of_lines_ch[:len(cy_main_hor),5]=cy_main_hor - matrix_of_lines_ch[:len(cy_main_hor),6]=y_min_main_hor - matrix_of_lines_ch[len(cy_main_hor):,6]=y_min_main_ver - matrix_of_lines_ch[:len(cy_main_hor),7]=y_max_main_hor - matrix_of_lines_ch[len(cy_main_hor):,7]=y_max_main_ver - matrix_of_lines_ch[:len(cy_main_hor),8]=dist_y_hor - matrix_of_lines_ch[len(cy_main_hor):,8]=dist_y_ver - matrix_of_lines_ch[len(cy_main_hor):,9]=1 + matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10)) + matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor + matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver + matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),2]=x_min_seps_hor+50#x_min_seps_hor+150 + matrix_of_seps_ch[len(cy_seps_hor):,2]=x_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),3]=x_max_seps_hor-50#x_max_seps_hor-150 + matrix_of_seps_ch[len(cy_seps_hor):,3]=x_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),4]=dist_x_hor + matrix_of_seps_ch[len(cy_seps_hor):,4]=dist_x_ver + matrix_of_seps_ch[:len(cy_seps_hor),5]=cy_seps_hor + matrix_of_seps_ch[:len(cy_seps_hor),6]=y_min_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,6]=y_min_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),7]=y_max_seps_hor + matrix_of_seps_ch[len(cy_seps_hor):,7]=y_max_seps_ver + matrix_of_seps_ch[:len(cy_seps_hor),8]=dist_y_hor + matrix_of_seps_ch[len(cy_seps_hor):,8]=dist_y_ver + matrix_of_seps_ch[len(cy_seps_hor):,9]=1 if contours_h is not None: - _, dist_x_head, x_min_main_head, x_max_main_head, cy_main_head, _, y_min_main_head, y_max_main_head, _ = \ + _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - matrix_l_n=np.zeros((matrix_of_lines_ch.shape[0]+len(cy_main_head),matrix_of_lines_ch.shape[1])) - matrix_l_n[:matrix_of_lines_ch.shape[0],:]=np.copy(matrix_of_lines_ch[:,:]) - args_head=np.arange(len(cy_main_head)) + len(cy_main_hor) - - matrix_l_n[matrix_of_lines_ch.shape[0]:,0]=args_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,2]=x_min_main_head+30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,3]=x_max_main_head-30 - matrix_l_n[matrix_of_lines_ch.shape[0]:,4]=dist_x_head - matrix_l_n[matrix_of_lines_ch.shape[0]:,5]=y_min_main_head-3-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,6]=y_min_main_head-5-8 - matrix_l_n[matrix_of_lines_ch.shape[0]:,7]=y_max_main_head#y_min_main_head+1-8 - 
matrix_l_n[matrix_of_lines_ch.shape[0]:,8]=4 - matrix_of_lines_ch=np.copy(matrix_l_n) - - cy_main_splitters=cy_main_hor[(x_min_main_hor<=.16*region_pre_p.shape[1]) & - (x_max_main_hor>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(special_separators)) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + args_head = np.arange(len(cy_head)) + matrix_l_n[:, 0] = args_head + matrix_l_n[:, 2] = x_min_head+30 + matrix_l_n[:, 3] = x_max_head-30 + matrix_l_n[:, 4] = dist_x_head + matrix_l_n[:, 5] = y_min_head-3-8 + matrix_l_n[:, 6] = y_min_head-5-8 + matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 8] = 4 + matrix_of_seps_ch = np.append( + matrix_of_seps_ch, matrix_l_n, axis=0) + + cy_seps_splitters=cy_seps_hor[(x_min_seps_hor<=.16*region_pre_p.shape[1]) & + (x_max_seps_hor>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, special_separators) + if contours_h is not None: - try: - cy_main_splitters_head=cy_main_head[(x_min_main_head<=.16*region_pre_p.shape[1]) & - (x_max_main_head>=.84*region_pre_p.shape[1])] - cy_main_splitters=np.array( list(cy_main_splitters)+list(cy_main_splitters_head)) - except: - pass - args_cy_splitter=np.argsort(cy_main_splitters) - cy_main_splitters_sort=cy_main_splitters[args_cy_splitter] + cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - splitter_y_new=[] - splitter_y_new.append(0) - for i in range(len(cy_main_splitters_sort)): - splitter_y_new.append( cy_main_splitters_sort[i] ) - splitter_y_new.append(region_pre_p.shape[0]) - splitter_y_new_diff=np.diff(splitter_y_new)/float(region_pre_p.shape[0])*100 + cy_seps_splitters = np.sort(cy_seps_splitters) + splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] + splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] @@ -1573,7 +1547,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] peaks_neg_fin_fin=peaks_neg_fin[:] - return num_col_fin, peaks_neg_fin_fin,matrix_of_lines_ch,splitter_y_new,separators_closeup_n + return num_col_fin, peaks_neg_fin_fin, matrix_of_seps_ch, splitter_y_new, separators_closeup_n def return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, From 5a0e4c3b0f2e089acff0b4fbf058f1d2e6f90f66 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:36:10 +0200 Subject: [PATCH 07/21] `find_number_of_columns_in_document`: improve splitter rule extend horizontal separators to full img width if they do not overlap any other regions (only as regards to returned `splitter_y` result, but without changing returned separators mask) --- src/eynollah/utils/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 168899f..b930bfd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1378,6 +1378,8 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point): return peaks_neg_tot def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None): + ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8)) + 
separators_closeup = 1 * (region_pre_p == label_seps) separators_closeup[0:110] = 0 separators_closeup[-150:] = 0 @@ -1398,10 +1400,19 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, min_xe = cnt[:, 0, 0].min() max_ye = cnt[:, 0, 1].max() min_ye = cnt[:, 0, 1].min() + med_ye = int(np.median(cnt[:, 0, 1])) dist_xe = max_xe - min_xe dist_ye = max_ye - min_ye if dist_ye <= 50 and dist_xe >= 3 * dist_ye: cnts_hor_e.append(cnt) + labels = np.setdiff1d(np.unique(ccomps[med_ye]), [0]) + if len(labels) == 1: + # mid line does not intersect with any other region + # so add it as extra splitter line + cnts_hor_e.append(np.array([[[0, med_ye]], + [[ccomps.shape[1], med_ye]], + [[ccomps.shape[1], med_ye + 1]], + [[0, med_ye + 1]]])) # delete horizontal contours (leaving only the edges) separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) From cd35241e816acc7e2083dc31d99f376a8877904b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 13:41:36 +0200 Subject: [PATCH 08/21] `find_number_of_columns_in_document`: split headings at top+baseline regarding `splitter_y` result, for headings, instead of cutting right through them via center line, add their toplines and baselines as if they were horizontal separators --- src/eynollah/utils/__init__.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b930bfd..0c3e4ae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1506,15 +1506,33 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) + # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + # args_head = np.arange(len(cy_head)) + # matrix_l_n[:, 0] = args_head + # matrix_l_n[:, 2] = x_min_head+30 + # matrix_l_n[:, 3] = x_max_head-30 + # matrix_l_n[:, 4] = dist_x_head + # matrix_l_n[:, 5] = y_min_head-3-8 + # matrix_l_n[:, 6] = y_min_head-5-8 + # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + # matrix_l_n[:, 8] = 4 + # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): + cy_head = np.stack((y_min_head, y_max_head)).T.flatten() + y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), + np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) + x_min_head = np.repeat(x_min_head, 2) + x_max_head = np.repeat(x_max_head, 2) + dist_x_head = np.repeat(dist_x_head, 2) matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - matrix_l_n[:, 2] = x_min_head+30 - matrix_l_n[:, 3] = x_max_head-30 + # +/- 30px to avoid crossing col peaks by accident + matrix_l_n[:, 2] = x_min_head + 30 + matrix_l_n[:, 3] = x_max_head - 30 matrix_l_n[:, 4] = dist_x_head - matrix_l_n[:, 5] = y_min_head-3-8 - matrix_l_n[:, 6] = y_min_head-5-8 - matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 + matrix_l_n[:, 5] = cy_head + matrix_l_n[:, 6] = y_min_head + matrix_l_n[:, 7] = y_max_head matrix_l_n[:, 8] = 4 matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) From 7c3e41858877211c82f5b6c91a02fccfe146cacb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:13:51 +0200 Subject: [PATCH 09/21] `return_boxes_of_images_by_order_of_reading_new`: simplify - 
enumeration instead of indexing - array instead of list operations - add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 351 ++++++++++++++++----------------- 1 file changed, 166 insertions(+), 185 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 0c3e4ae..698b0bd 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -5,6 +5,7 @@ try: import matplotlib.pyplot as plt + import matplotlib.patches as patches except ImportError: plt = None import numpy as np @@ -20,6 +21,7 @@ return_contours_of_image, return_parent_contours) + def pairwise(iterable): # pairwise('ABCDEFG') → AB BC CD DE EF FG @@ -205,15 +207,15 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( #print(x_end,'x_end') #print(len_sep) - deleted=[] + deleted = set() for i in range(len(x_start)-1): nodes_i=set(range(x_start[i],x_end[i]+1)) for j in range(i+1,len(x_start)): if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.append(j) + deleted.add(j) #print(np.unique(deleted)) - remained_sep_indexes=set(range(len(x_start)))-set(np.unique(deleted) ) + remained_sep_indexes = set(range(len(x_start))) - deleted #print(remained_sep_indexes,'remained_sep_indexes') mother=[]#if it has mother child=[] @@ -262,7 +264,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] - reading_orther_type=0 + reading_order_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] @@ -278,12 +280,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end[remained_sep_indexes_without_mother[j]] # + 1 )) - set_diff = nodes_i - nodes_j - if set_diff != nodes_i: - reading_orther_type = 1 + if nodes_i - nodes_j != nodes_i: + reading_order_type = 1 else: - reading_orther_type = 0 - #print(reading_orther_type,'javab') + reading_order_type = 0 + #print(reading_order_type,'javab') #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') @@ -297,7 +298,7 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( #print(all_args_uniq,'all_args_uniq') #print(args_to_be_unified,'args_to_be_unified') - return (reading_orther_type, + return (reading_order_type, x_start_returned, x_end_returned, y_sep_returned, @@ -1590,77 +1591,90 @@ def return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') + # def dbg_plt(box=None, title=None): + # if box is None: + # box = [None, None, None, None] + # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + # plt.imshow(img) + # xrange = np.arange(0, img.shape[1], 100) + # yrange = np.arange(0, img.shape[0], 100) + # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) + # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # if title: + # plt.title(title) + # plt.show() + # dbg_plt() boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - for i in range(len(splitter_y_new)-1): - 
#print(splitter_y_new[i],splitter_y_new[i+1]) - matrix_new = matrix_of_lines_ch[:,:][(matrix_of_lines_ch[:,6]> splitter_y_new[i] ) & - (matrix_of_lines_ch[:,7]< splitter_y_new[i+1] )] + width_tot = regions_without_separators.shape[1] + for top, bot in pairwise(splitter_y_new): + # print("%d:%d" % (top, bot), 'i') + # dbg_plt([None, None, top, bot], + # "image cut for y split %d:%d" % ( + # top, bot)) + matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & + (matrix_of_lines_ch[:,7] < bot)] #print(len( matrix_new[:,9][matrix_new[:,9]==1] )) #print(matrix_new[:,8][matrix_new[:,9]==1],'gaddaaa') # check to see is there any vertical separator to find holes. #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= - # 0.1 * (np.abs(splitter_y_new[i+1]-splitter_y_new[i]))): + # 0.1 * (np.abs(bot-top))): if True: try: num_col, peaks_neg_fin = find_num_col( - regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1], :], + regions_without_separators[top:bot], num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) except: peaks_neg_fin=[] num_col = 0 try: if (len(peaks_neg_fin)+1)=len(peaks_neg_fin2): - peaks_neg_fin=list(np.copy(peaks_neg_fin1)) + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin=list(np.copy(peaks_neg_fin2)) - peaks_neg_fin=list(np.array(peaks_neg_fin)+peaks_neg_fin_early[i_n]) - - if i_n!=(len(peaks_neg_fin_early)-2): - peaks_neg_fin_rev.append(peaks_neg_fin_early[i_n+1]) + peaks_neg_fin = peaks_neg_fin2 + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_rev=peaks_neg_fin_rev+peaks_neg_fin + + if right < peaks_neg_fin_early[-1]: + peaks_neg_fin_rev.append(right) + peaks_neg_fin_rev.extend(peaks_neg_fin) if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) @@ -1673,21 +1687,20 @@ def return_boxes_of_images_by_order_of_reading_new( except: logger.exception("cannot find peaks consistent with columns") #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[splitter_y_new[i]:splitter_y_new[i+1],:], + # regions_without_separators[top:bot,:], # multiplier=7.0) x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] - arg_org_hor_some=matrix_new[:,0][ (matrix_new[:,9]==0) ] if right2left_readingorder: - x_max_hor_some_new = regions_without_separators.shape[1] - x_min_hor_some - x_min_hor_some_new = regions_without_separators.shape[1] - x_max_hor_some + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some x_min_hor_some =list(np.copy(x_min_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot=return_points_with_boundies(peaks_neg_fin,0, regions_without_separators[:,:].shape[1]) + peaks_neg_tot = [0] + peaks_neg_fin + [width_tot] peaks_neg_tot_tables.append(peaks_neg_tot) reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ @@ -1697,26 +1710,27 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) all_columns = set(range(len(peaks_neg_tot) - 1)) - if ((reading_order_type==1) or - (reading_order_type==0 and - (len(y_lines_without_mother)>=2 or there_is_sep_with_child==1))): + # print("all_columns", 
all_columns) + if (reading_order_type == 1 or + len(y_lines_without_mother) >= 2 or + there_is_sep_with_child == 1): try: - y_grenze = splitter_y_new[i] + 300 + y_grenze = top + 300 #check if there is a big separator in this y_mains_sep_ohne_grenzen args_early_ys=np.arange(len(y_type_2)) #print(args_early_ys,'args_early_ys') - #print(splitter_y_new[i], splitter_y_new[i+1]) + #print(top, bot) - x_starting_up = x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up = x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up = x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up = y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > splitter_y_new[i]) & + args_up = args_early_ys[(y_type_2 > top) & (y_type_2 <= y_grenze)] if len(y_type_2_up) > 0: y_main_separator_up = y_type_2_up [(x_starting_up==0) & @@ -1730,27 +1744,28 @@ def return_boxes_of_images_by_order_of_reading_new( args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], y_diff_main_separator_up.max()]) - splitter_y_new[i] = y_diff_main_separator_up.max() + top, y_diff_main_separator_up.max()]) + # dbg_plt(boxes[-1], "first box") + top = y_diff_main_separator_up.max() - #print(splitter_y_new[i],'splitter_y_new[i]') + #print(top,'top') y_type_2 = y_type_2[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] y_diff_type_2 = y_diff_type_2[args_to_be_kept] #print('galdiha') - y_grenze = splitter_y_new[i] + 200 + y_grenze = top + 200 args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > splitter_y_new[i]) & + y_type_2_up=y_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > splitter_y_new[i]) & + x_starting_up=x_starting[(y_type_2 > top) & (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > splitter_y_new[i]) & + x_ending_up=x_ending[(y_type_2 > top) & (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > splitter_y_new[i]) & + y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > splitter_y_new[i]) & + args_up2=args_early_ys2[(y_type_2 > top) & (y_type_2 <= y_grenze)] #print(y_type_2_up,x_starting_up,x_ending_up,'didid') nodes_in = set() @@ -1804,13 +1819,14 @@ def return_boxes_of_images_by_order_of_reading_new( pass #print('burdaydikh2') - #int(splitter_y_new[i]) + #int(top) y_lines_by_order=[] x_start_by_order=[] x_end_by_order=[] - if (len(x_end_with_child_without_mother)==0 and reading_order_type==0) or reading_order_type==1: - if reading_order_type==1: - y_lines_by_order.append(splitter_y_new[i]) + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + y_lines_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: @@ -1823,8 +1839,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - 
##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1839,22 +1855,15 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() @@ -1864,8 +1873,8 @@ def return_boxes_of_images_by_order_of_reading_new( x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) @@ -1888,25 +1897,24 @@ def return_boxes_of_images_by_order_of_reading_new( x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) for i_s_nc in columns_not_covered_child_no_mother: if i_s_nc in x_start_with_child_without_mother: + #print("i_s_nc", i_s_nc) x_end_biggest_column = \ x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & (x_ending==x_end_biggest_column)] y_column_nc = y_type_2[args_all_biggest_lines] - x_start_column_nc = x_starting[args_all_biggest_lines] - x_end_column_nc = x_ending[args_all_biggest_lines] + #x_start_column_nc = x_starting[args_all_biggest_lines] + #x_end_column_nc = x_ending[args_all_biggest_lines] y_column_nc = np.sort(y_column_nc) for i_c in range(len(y_column_nc)): - if i_c==(len(y_column_nc)-1): - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] - else: - ind_all_lines_between_nm_wc=ind_args[(y_type_2>y_column_nc[i_c]) & - (y_type_2=i_s_nc) & - (x_ending<=x_end_biggest_column)] + #print("i_c", i_c) + ind_all_lines_between_nm_wc = \ + ind_args[(y_type_2 > y_column_nc[i_c]) & + (y_type_2 < (y_column_nc[i_c+1] + if i_c < len(y_column_nc)-1 + else bot)) & + (x_starting >= i_s_nc) & + (x_ending <= 
x_end_biggest_column)] y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] @@ -1965,78 +1973,58 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: #print(column,'column') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) - + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - - #print(y_copy,'y_copy2') + #print(il, "il") + y_itself = y_lines_by_order[il] + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print('burda') + y_down = y_in_cols.min(initial=bot) #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] #print(y_itself,'y_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "A column %d box" % 
(column + 1)) except: logger.exception("cannot assign boxes") boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - splitter_y_new[i], splitter_y_new[i+1]]) + top, bot]) + # dbg_plt(boxes[-1], "fallback box") else: y_lines_by_order=[] x_start_by_order=[] @@ -2050,8 +2038,8 @@ def return_boxes_of_images_by_order_of_reading_new( columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2064,8 +2052,8 @@ def return_boxes_of_images_by_order_of_reading_new( else: columns_not_covered = list(all_columns) y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * splitter_y_new[i]) - ##y_lines_by_order = np.append(y_lines_by_order, [splitter_y_new[i]] * len(columns_not_covered)) + dtype=int) * top) + ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2075,71 +2063,64 @@ def return_boxes_of_images_by_order_of_reading_new( for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - ind_args_in_col=np.array(ind_args_in_col) #print(len(y_type_2)) y_column=y_type_2[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted=np.argsort(y_column) - y_col_sort=y_column[ind_args_col_sorted] - x_start_column_sort=x_start_column[ind_args_col_sorted] - x_end_column_sort=x_end_column[ind_args_col_sorted] - #print('babali4') - for ii in range(len(y_col_sort)): - #print('babali5') - y_lines_by_order.append(y_col_sort[ii]) - x_start_by_order.append(x_start_column_sort[ii]) - x_end_by_order.append(x_end_column_sort[ii]-1) + ind_args_col_sorted = np.argsort(y_column) + y_lines_by_order.extend(y_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_lines_by_order = np.array(y_lines_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) for il in range(len(y_lines_by_order)): - y_copy = list(y_lines_by_order) - x_start_copy = list(x_start_by_order) - x_end_copy = list(x_end_by_order) - - #print(y_copy,'y_copy') - y_itself=y_copy.pop(il) - x_start_itself=x_start_copy.pop(il) - x_end_itself=x_end_copy.pop(il) - + #print(il, "il") + y_itself = y_lines_by_order[il] + #print(y_itself,'y_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols=[] - for yic in range(len(y_copy)): - #print('burda') - if (y_copy[yic]>y_itself and - column>=x_start_copy[yic] and - column<=x_end_copy[yic]): - y_in_cols.append(y_copy[yic]) + y_in_cols = y_lines_by_order[(y_itself < 
y_lines_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] #print('burda2') #print(y_in_cols,'y_in_cols') - if len(y_in_cols)>0: - y_down=np.min(y_in_cols) - else: - y_down=splitter_y_new[i+1] - #print(y_itself,'y_itself') + y_down = y_in_cols.min(initial=bot) + #print(y_down,'y_down') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_itself, y_down]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,splitter_y_new[i],splitter_y_new[i+1]]) + #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] if len(peaks_neg_tot_tables)>=1: for peaks_tab_ind in peaks_neg_tot_tables: - peaks_neg_tot_tables_ind = regions_without_separators.shape[1] - np.array(peaks_tab_ind) + peaks_neg_tot_tables_ind = width_tot - np.array(peaks_tab_ind) peaks_neg_tot_tables_ind = list(peaks_neg_tot_tables_ind[::-1]) peaks_neg_tot_tables_new.append(peaks_neg_tot_tables_ind) for i in range(len(boxes)): - x_start_new = regions_without_separators.shape[1] - boxes[i][1] - x_end_new = regions_without_separators.shape[1] - boxes[i][0] + x_start_new = width_tot - boxes[i][1] + x_end_new = width_tot - boxes[i][0] boxes[i][0] = x_start_new boxes[i][1] = x_end_new peaks_neg_tot_tables = peaks_neg_tot_tables_new + # show final xy-cut + # plt.imshow(regions_without_separators) + # for xmin, xmax, ymin, ymax in boxes: + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.show() + logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 0fc4b2535dc005612406cd4ffbf2471a5b4e1485 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 20 Oct 2025 16:47:35 +0200 Subject: [PATCH 10/21] `return_boxes_of_images_by_order_of_reading_new`: fix no-mother case - when handling lines without mother, and biggest line already accounts for all columns, but some are too close to the top and therefore must be removed, avoid invalidating `biggest` index, causing `IndexError` - remove try-catch (now unnecessary) - array instead of list operations --- src/eynollah/utils/__init__.py | 62 ++++++++++++++++------------------ 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 698b0bd..b331cab 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1919,54 +1919,50 @@ def return_boxes_of_images_by_order_of_reading_new( x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] - x_diff_all_between_nm_wc = x_ending_all_between_nm_wc - x_starting_all_between_nm_wc - if len(x_diff_all_between_nm_wc)>0: - biggest=np.argmax(x_diff_all_between_nm_wc) - columns_covered_by_mothers = set() - for dj in range(len(x_starting_all_between_nm_wc)): + for dj in range(len(ind_all_lines_between_nm_wc)): columns_covered_by_mothers.update( range(x_starting_all_between_nm_wc[dj], x_ending_all_between_nm_wc[dj])) child_columns = set(range(i_s_nc, x_end_biggest_column)) columns_not_covered = list(child_columns - columns_covered_by_mothers) - should_longest_line_be_extended=0 - if (len(x_diff_all_between_nm_wc) > 0 and - set(list(range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])) + - list(columns_not_covered)) != child_columns): - 
should_longest_line_be_extended=1 - index_lines_so_close_to_top_separator = \ - np.arange(len(y_all_between_nm_wc))[(y_all_between_nm_wc>y_column_nc[i_c]) & - (y_all_between_nm_wc<=(y_column_nc[i_c]+500))] - if len(index_lines_so_close_to_top_separator) > 0: - indexes_remained_after_deleting_closed_lines= \ - np.array(list(set(list(range(len(y_all_between_nm_wc)))) - - set(list(index_lines_so_close_to_top_separator)))) - if len(indexes_remained_after_deleting_closed_lines) > 0: + if len(ind_all_lines_between_nm_wc): + biggest = np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # biggest accounts for all columns alone, + # longest line should be extended + lines_so_close_to_top_separator = \ + ((y_all_between_nm_wc > y_column_nc[i_c]) & + (y_all_between_nm_wc <= y_column_nc[i_c] + 500)) + if (np.count_nonzero(lines_so_close_to_top_separator) and + np.count_nonzero(lines_so_close_to_top_separator) < + len(ind_all_lines_between_nm_wc)): y_all_between_nm_wc = \ - y_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + y_all_between_nm_wc[~lines_so_close_to_top_separator] x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] + x_starting_all_between_nm_wc[~lines_so_close_to_top_separator] x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[indexes_remained_after_deleting_closed_lines] - - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + x_ending_all_between_nm_wc[~lines_so_close_to_top_separator] - if len(x_diff_all_between_nm_wc) > 0: - try: + y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) + x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + else: y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - except: - logger.exception("cannot append") - y_all_between_nm_wc = np.append(y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + if len(columns_not_covered): + y_all_between_nm_wc = np.append( + y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) for column in range(int(i_s_nc), int(x_end_biggest_column)): From e2dfec75fbefe3e5aeffd71a7a61eab6092f6c92 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:19:20 +0200 Subject: [PATCH 11/21] `return_x_start_end_mothers_childs_and_type_of_reading_order`: simplify and document MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit - simplify - rename identifiers to make readable: - `y_sep` → `y_mid` (because the cy gets passed) - `y_diff` → `y_max` (because the ymax gets passed) - array instead of list operations - add docstring and in-line comments - return (zero-length) numpy array instead of empty list --- src/eynollah/eynollah.py | 10 +- src/eynollah/utils/__init__.py | 386 +++++++++++++++++---------------- 2 files changed, 202 insertions(+), 194 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 9412861..08ffed7 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2507,6 +2507,7 @@ def match_boxes(only_centers: bool): My_main[ii] < box[3])): arg_text_con_main[ii] = jj check_if_textregion_located_in_a_box = True + #print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0) @@ -2514,6 +2515,7 @@ def match_boxes(only_centers: bool): (boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_main[ii] = ind_min + #print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers) args_contours_main = np.arange(len(contours_only_text_parent)) order_by_con_main = np.zeros_like(arg_text_con_main) @@ -2531,6 +2533,7 @@ def match_boxes(only_centers: bool): My_head[ii] < box[3])): arg_text_con_head[ii] = jj check_if_textregion_located_in_a_box = True + #print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers) break if not check_if_textregion_located_in_a_box: dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0) @@ -2538,6 +2541,7 @@ def match_boxes(only_centers: bool): (boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1])) ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box)) arg_text_con_head[ii] = ind_min + #print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers) args_contours_head = np.arange(len(contours_only_text_parent_h)) order_by_con_head = np.zeros_like(arg_text_con_head) @@ -2587,7 +2591,7 @@ def match_boxes(only_centers: bool): try: results = match_boxes(False) except Exception as why: - self.logger.error(why) + self.logger.exception(why) results = match_boxes(True) self.logger.debug("exit do_order_of_regions") @@ -2976,7 +2980,7 @@ def run_graphics_and_columns_light( max(self.num_col_lower or num_col_classifier, num_col_classifier)) except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None #print("inside graphics 3 ", time.time() - t_in_gr) return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, @@ -3044,7 +3048,7 @@ def run_graphics_and_columns( if not num_column_is_classified: num_col_classifier = num_col + 1 except Exception as why: - self.logger.error(why) + self.logger.exception(why) num_col = None return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines, text_regions_p_1, cont_page, table_prediction) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index b331cab..f1a8aae 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ 
-33,226 +33,229 @@ def pairwise(iterable): a = b def return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, cy_hor_diff): + x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some): + """ + Analyse which separators overlap multiple column candidates, + and how they overlap each other. + + Ignore separators not spanning multiple columns. + + For the separators to be returned, try to join them when they are directly + adjacent horizontally but nearby vertically (and thus mutually compatible). + Also, mark any separators that already span the full width. + + Furthermore, identify which pairs of (unjoined) separators span subsets of columns + of each other (disregarding vertical positions). Referring, respectively, to the + superset separators as "mothers" and to the subset separators as "children", + retrieve information on which columns are spanned by separators with no mother, + and which columns are spanned by their children (if any). + + Moreover, determine if there is any (column) overlap among the multi-span separators + with no mother, specifically (and thus, no simple box separation is possible). + + Arguments: + * the x start column index of the raw separators + * the x end column index of the raw separators + * the y center coordinate of the raw separators + * the x column coordinates + * the y end coordinate of the raw separators + + Returns: + a tuple of: + * whether any top-level (no-mother) multi-span separators overlap each other + * the x start column index of the resulting multi-span separators + * the x end column index of the resulting multi-span separators + * the y center coordinate of the resulting multi-span separators + * the y end coordinate of the resulting multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + * the x start column index of the top-level (no-mother) multi-span separators + * the x end column index of the top-level (no-mother) multi-span separators + * whether any multi-span separators have super-spans of other (child) multi-span separators + * the y center (for 1 representative) of the top-level (no-mother) multi-span separators + which have super-spans of other (child) multi-span separators + * the x start column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * the x end column index of the top-level multi-span separators + which have super-spans of other (child) multi-span separators + * indexes of multi-span separators with full-width span + """ x_start=[] x_end=[] - kind=[]#if covers 2 and more than 2 columns set it to 1 otherwise 0 len_sep=[] - y_sep=[] - y_diff=[] + y_mid=[] + y_max=[] new_main_sep_y=[] - indexer=0 for i in range(len(x_min_hor_some)): - starting=x_min_hor_some[i]-peak_points - starting=starting[starting>=0] - min_start=np.argmin(starting) - ending=peak_points-x_max_hor_some[i] - len_ending_neg=len(ending[ending<=0]) - - ending=ending[ending>0] - max_end=np.argmin(ending)+len_ending_neg + #print(indexer, "%d:%d" % (x_min_hor_some[i], x_max_hor_some[i]), cy_hor_some[i]) + starting = x_min_hor_some[i] - peak_points + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = x_max_hor_some[i] - peak_points + max_end = np.flatnonzero(ending < 0)[0] # first right-of + #print(indexer, "%d:%d" % (min_start, max_end)) if (max_end-min_start)>=2: + # column range of separator spans more than one column candidate if 
(max_end-min_start)==(len(peak_points)-1): + # all columns (i.e. could be true new y splitter) new_main_sep_y.append(indexer) #print((max_end-min_start),len(peak_points),'(max_end-min_start)') - y_sep.append(cy_hor_some[i]) - y_diff.append(cy_hor_diff[i]) + y_mid.append(cy_hor_some[i]) + y_max.append(y_max_hor_some[i]) x_end.append(max_end) - - x_start.append( min_start) - + x_start.append(min_start) len_sep.append(max_end-min_start) - if max_end==min_start+1: - kind.append(0) - else: - kind.append(1) - indexer+=1 + #print(x_start,'x_start') + #print(x_end,'x_end') x_start_returned = np.array(x_start, dtype=int) x_end_returned = np.array(x_end, dtype=int) - y_sep_returned = np.array(y_sep, dtype=int) - y_diff_returned = np.array(y_diff, dtype=int) - - all_args_uniq = contours_in_same_horizon(y_sep_returned) - args_to_be_unified=[] - y_unified=[] - y_diff_unified=[] - x_s_unified=[] - x_e_unified=[] - if len(all_args_uniq)>0: - #print('burda') - if type(all_args_uniq[0]) is list: - for dd in range(len(all_args_uniq)): - if len(all_args_uniq[dd])==2: - x_s_same_hor=np.array(x_start_returned)[all_args_uniq[dd]] - x_e_same_hor=np.array(x_end_returned)[all_args_uniq[dd]] - y_sep_same_hor=np.array(y_sep_returned)[all_args_uniq[dd]] - y_diff_same_hor=np.array(y_diff_returned)[all_args_uniq[dd]] - #print('burda2') - if (x_s_same_hor[0]==x_e_same_hor[1]-1 or - x_s_same_hor[1]==x_e_same_hor[0]-1 and - x_s_same_hor[0]!=x_s_same_hor[1] and - x_e_same_hor[0]!=x_e_same_hor[1]): - #print('burda3') - for arg_in in all_args_uniq[dd]: - #print(arg_in,'arg_in') - args_to_be_unified.append(arg_in) - y_selected=np.min(y_sep_same_hor) - y_diff_selected=np.max(y_diff_same_hor) - x_s_selected=np.min(x_s_same_hor) - x_e_selected=np.max(x_e_same_hor) - - x_s_unified.append(x_s_selected) - x_e_unified.append(x_e_selected) - y_unified.append(y_selected) - y_diff_unified.append(y_diff_selected) - #print(x_s_same_hor,'x_s_same_hor') - #print(x_e_same_hor[:]-1,'x_e_same_hor') - #print('#############################') - #print(x_s_unified,'y_selected') - #print(x_e_unified,'x_s_selected') - #print(y_unified,'x_e_same_hor') - - args_lines_not_unified=list( set(range(len(y_sep_returned)))-set(args_to_be_unified) ) - #print(args_lines_not_unified,'args_lines_not_unified') - - x_start_returned_not_unified=list( np.array(x_start_returned)[args_lines_not_unified] ) - x_end_returned_not_unified=list( np.array(x_end_returned)[args_lines_not_unified] ) - y_sep_returned_not_unified=list (np.array(y_sep_returned)[args_lines_not_unified] ) - y_diff_returned_not_unified=list (np.array(y_diff_returned)[args_lines_not_unified] ) - - for dv in range(len(y_unified)): - y_sep_returned_not_unified.append(y_unified[dv]) - y_diff_returned_not_unified.append(y_diff_unified[dv]) - x_start_returned_not_unified.append(x_s_unified[dv]) - x_end_returned_not_unified.append(x_e_unified[dv]) - - #print(y_sep_returned,'y_sep_returned') + y_mid_returned = np.array(y_mid, dtype=int) + y_max_returned = np.array(y_max, dtype=int) + #print(y_mid_returned,'y_mid_returned') #print(x_start_returned,'x_start_returned') #print(x_end_returned,'x_end_returned') - x_start_returned = np.array(x_start_returned_not_unified, dtype=int) - x_end_returned = np.array(x_end_returned_not_unified, dtype=int) - y_sep_returned = np.array(y_sep_returned_not_unified, dtype=int) - y_diff_returned = np.array(y_diff_returned_not_unified, dtype=int) - - #print(y_sep_returned,'y_sep_returned2') + # join/elongate separators if follow-up x and similar y + sep_pairs = 
contours_in_same_horizon(y_mid_returned) + if len(sep_pairs): + #print('burda') + args_to_be_unified = set() + y_mid_unified = [] + y_max_unified = [] + x_start_unified = [] + x_end_unified = [] + for pair in sep_pairs: + if (not np.array_equal(*x_start_returned[pair]) and + not np.array_equal(*x_end_returned[pair]) and + # immediately adjacent columns? + np.diff(x_end_returned[pair] - + x_start_returned[pair])[0] in [1, -1]): + + args_to_be_unified.union(set(pair)) + y_mid_unified.append(np.min(y_mid_returned[pair])) + y_max_unified.append(np.max(y_max_returned[pair])) + x_start_unified.append(np.min(x_start_returned[pair])) + x_end_unified.append(np.max(x_end_returned[pair])) + #print(pair,'pair') + #print(x_start_returned[pair],'x_s_same_hor') + #print(x_end_returned[pair],'x_e_same_hor') + #print(y_mid_unified,'y_mid_unified') + #print(y_max_unified,'y_max_unified') + #print(x_start_unified,'x_s_unified') + #print(x_end_unified,'x_e_selected') + #print('#############################') + + if len(y_mid_unified): + args_lines_not_unified = np.setdiff1d(np.arange(len(y_mid_returned)), + list(args_to_be_unified), assume_unique=True) + #print(args_lines_not_unified,'args_lines_not_unified') + x_start_returned = np.append(x_start_returned[args_lines_not_unified], + x_start_unified, axis=0) + x_end_returned = np.append(x_end_returned[args_lines_not_unified], + x_end_unified, axis=0) + y_mid_returned = np.append(y_mid_returned[args_lines_not_unified], + y_mid_unified, axis=0) + y_max_returned = np.append(y_max_returned[args_lines_not_unified], + y_max_unified, axis=0) + #print(y_mid_returned,'y_mid_returned2') #print(x_start_returned,'x_start_returned2') #print(x_end_returned,'x_end_returned2') - #print(new_main_sep_y,'new_main_sep_y') + #print(new_main_sep_y,'new_main_sep_y') #print(x_start,'x_start') #print(x_end,'x_end') - if len(new_main_sep_y)>0: - - min_ys=np.min(y_sep) - max_ys=np.max(y_sep) - - y_mains=[] - y_mains.append(min_ys) - y_mains_sep_ohne_grenzen=[] - - for ii in range(len(new_main_sep_y)): - y_mains.append(y_sep[new_main_sep_y[ii]]) - y_mains_sep_ohne_grenzen.append(y_sep[new_main_sep_y[ii]]) - - y_mains.append(max_ys) - - y_mains_sorted=np.sort(y_mains) - diff=np.diff(y_mains_sorted) - argm=np.argmax(diff) - - y_min_new=y_mains_sorted[argm] - y_max_new=y_mains_sorted[argm+1] - - #print(y_min_new,'y_min_new') - #print(y_max_new,'y_max_new') - #print(y_sep[new_main_sep_y[0]],y_sep,'yseps') + x_start = np.array(x_start) + x_end = np.array(x_end) + y_mid = np.array(y_mid) + if len(new_main_sep_y): + # some full-width multi-span separators exist, so + # restrict the y range of separators to search for + # mutual overlaps to only those within the largest + # y strip between adjacent multi-span separators + # that involve at least one such full-width seps. 
+ # (does not affect the separators to be returned) + min_ys=np.min(y_mid) + max_ys=np.max(y_mid) + #print(min_ys,'min_ys') + #print(max_ys,'max_ys') + + y_mains0 = list(y_mid[new_main_sep_y]) + y_mains = [min_ys] + y_mains0 + [max_ys] + + y_mains = np.sort(y_mains) + argm = np.argmax(np.diff(y_mains)) + y_mid_new = y_mains[argm] + y_mid_next_new = y_mains[argm + 1] + + #print(y_mid_new,argm,'y_mid_new') + #print(y_mid_next_new,argm+1,'y_mid_next_new') + #print(y_mid[new_main_sep_y],new_main_sep_y,'yseps') x_start=np.array(x_start) x_end=np.array(x_end) - kind=np.array(kind) - y_sep=np.array(y_sep) - if (y_min_new in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>y_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sepy_min_new) & (y_sep<=y_max_new)] - #print('burda1') - x_end=x_end[(y_sep>y_min_new) & (y_sep<=y_max_new)] - #print('burda2') - kind=kind[(y_sep>y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>y_min_new) & (y_sep<=y_max_new)] - elif (y_min_new not in y_mains_sep_ohne_grenzen and - y_max_new in y_mains_sep_ohne_grenzen): - x_start=x_start[(y_sep>=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep=y_min_new) & (y_sep y_mid_new + else: + where = y_mid >= y_mid_new + if y_mid_next_new in y_mains0: + where &= y_mid < y_mid_next_new else: - x_start=x_start[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - x_end=x_end[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - kind=kind[(y_sep>=y_min_new) & (y_sep<=y_max_new)] - y_sep=y_sep[(y_sep>=y_min_new) & (y_sep<=y_max_new)] + where &= y_mid <= y_mid_next_new + x_start = x_start[where] + x_end = x_end[where] + y_mid = y_mid[where] #print(x_start,'x_start') #print(x_end,'x_end') - #print(len_sep) + # remove redundant separators that span the same columns + # (keeping only 1 representative each) deleted = set() - for i in range(len(x_start)-1): - nodes_i=set(range(x_start[i],x_end[i]+1)) - for j in range(i+1,len(x_start)): - if nodes_i==set(range(x_start[j],x_end[j]+1)): - deleted.add(j) - #print(np.unique(deleted)) - + for index_i in range(len(x_start) - 1): + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) + #print(nodes_i, "nodes_i") + for index_j in range(index_i + 1, len(x_start)): + nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) + #print(nodes_j, "nodes_j") + if nodes_i == nodes_j: + deleted.add(index_j) + #print(deleted,"deleted") remained_sep_indexes = set(range(len(x_start))) - deleted #print(remained_sep_indexes,'remained_sep_indexes') - mother=[]#if it has mother - child=[] + + # determine which separators span which columns + mother = [] # whether the respective separator has a mother separator + child = [] # whether the respective separator has a child separator for index_i in remained_sep_indexes: have_mother=0 have_child=0 - nodes_ind=set(range(x_start[index_i],x_end[index_i]+1)) + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) for index_j in remained_sep_indexes: - nodes_ind_j=set(range(x_start[index_j],x_end[index_j]+1)) - if nodes_indnodes_ind_j: + if nodes_i > nodes_j: have_child=1 mother.append(have_mother) child.append(have_child) - - #print(mother,'mother') - #print(len(remained_sep_indexes)) - #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_sep),'lens') - y_lines_without_mother=[] - x_start_without_mother=[] - x_end_without_mother=[] - - y_lines_with_child_without_mother=[] - x_start_with_child_without_mother=[] - x_end_with_child_without_mother=[] + #print(mother, "mother") + #print(child, "child") 
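Aside — a minimal standalone sketch (with made-up column spans, not taken from the patch) of the mother/child relation computed above: a separator has a "mother" if another separator's column span is a strict superset of its own, and a "child" if its own span strictly contains another separator's span.

    import numpy as np

    # hypothetical separators, each spanning columns [start, end] inclusive
    x_start = np.array([0, 1, 0])
    x_end   = np.array([3, 2, 1])

    mother, child = [], []
    for i in range(len(x_start)):
        nodes_i = set(range(x_start[i], x_end[i] + 1))
        # strict subset/superset tests, so comparing a span with itself is a no-op
        mother.append(int(any(nodes_i < set(range(x_start[j], x_end[j] + 1))
                              for j in range(len(x_start)))))
        child.append(int(any(nodes_i > set(range(x_start[j], x_end[j] + 1))
                             for j in range(len(x_start)))))

    print(mother)  # [0, 1, 1] -- spans 1..2 and 0..1 lie inside 0..3
    print(child)   # [1, 0, 0] -- only 0..3 strictly contains other spans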
mother = np.array(mother) child = np.array(child) #print(mother,'mother') #print(child,'child') remained_sep_indexes = np.array(list(remained_sep_indexes)) - x_start = np.array(x_start) - x_end = np.array(x_end) - y_sep = np.array(y_sep) + #print(len(remained_sep_indexes)) + #print(len(remained_sep_indexes),len(x_start),len(x_end),len(y_mid),'lens') - if len(remained_sep_indexes)>1: + reading_order_type = 0 + if len(remained_sep_indexes): #print(np.array(remained_sep_indexes),'np.array(remained_sep_indexes)') #print(np.array(mother),'mother') remained_sep_indexes_without_mother = remained_sep_indexes[mother==0] @@ -262,52 +265,53 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( x_end_with_child_without_mother = x_end[remained_sep_indexes_with_child_without_mother] x_start_with_child_without_mother = x_start[remained_sep_indexes_with_child_without_mother] - y_lines_with_child_without_mother = y_sep[remained_sep_indexes_with_child_without_mother] + y_mid_with_child_without_mother = y_mid[remained_sep_indexes_with_child_without_mother] - reading_order_type=0 x_end_without_mother = x_end[remained_sep_indexes_without_mother] x_start_without_mother = x_start[remained_sep_indexes_without_mother] - y_lines_without_mother = y_sep[remained_sep_indexes_without_mother] + y_mid_without_mother = y_mid[remained_sep_indexes_without_mother] if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): - nodes_i=set(range(x_start[remained_sep_indexes_without_mother[i]], - x_end[remained_sep_indexes_without_mother[i]] - # + 1 - )) - for j in range(i+1,len(remained_sep_indexes_without_mother)): - nodes_j=set(range(x_start[remained_sep_indexes_without_mother[j]], - x_end[remained_sep_indexes_without_mother[j]] - # + 1 - )) + index_i = remained_sep_indexes_without_mother[i] + nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1 + #print(index_i, nodes_i, "nodes_i without mother") + for j in range(i + 1, len(remained_sep_indexes_without_mother)): + index_j = remained_sep_indexes_without_mother[j] + nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1 + #print(index_j, nodes_j, "nodes_j without mother") if nodes_i - nodes_j != nodes_i: + #print("type=1") reading_order_type = 1 else: - reading_order_type = 0 - #print(reading_order_type,'javab') - #print(y_lines_with_child_without_mother,'y_lines_with_child_without_mother') + y_mid_without_mother = np.zeros(0, int) + x_start_without_mother = np.zeros(0, int) + x_end_without_mother = np.zeros(0, int) + y_mid_with_child_without_mother = np.zeros(0, int) + x_start_with_child_without_mother = np.zeros(0, int) + x_end_with_child_without_mother = np.zeros(0, int) + + #print(reading_order_type,'reading_order_type') + #print(y_mid_with_child_without_mother,'y_mid_with_child_without_mother') #print(x_start_with_child_without_mother,'x_start_with_child_without_mother') #print(x_end_with_child_without_mother,'x_end_with_hild_without_mother') len_sep_with_child = len(child[child==1]) - #print(len_sep_with_child,'len_sep_with_child') there_is_sep_with_child = 0 if len_sep_with_child >= 1: there_is_sep_with_child = 1 - #print(all_args_uniq,'all_args_uniq') - #print(args_to_be_unified,'args_to_be_unified') return (reading_order_type, x_start_returned, x_end_returned, - y_sep_returned, - y_diff_returned, - y_lines_without_mother, + y_mid_returned, + y_max_returned, + y_mid_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, - y_lines_with_child_without_mother, + 
y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, new_main_sep_y) From b2a79cc6ed766cef5074629fcb76ae1c6846f084 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:31:52 +0200 Subject: [PATCH 12/21] `return_x_start_end_mothers_childs_and_type_of_reading_order`: fix+1 when calculating `reading_order_type`, upper limit on column range (`x_end`) needs to be `+1` here as well --- src/eynollah/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f1a8aae..3a383e9 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -274,11 +274,11 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order( if len(remained_sep_indexes_without_mother)>=2: for i in range(len(remained_sep_indexes_without_mother)-1): index_i = remained_sep_indexes_without_mother[i] - nodes_i = set(range(x_start[index_i], x_end[index_i])) # + 1 + nodes_i = set(range(x_start[index_i], x_end[index_i] + 1)) #print(index_i, nodes_i, "nodes_i without mother") for j in range(i + 1, len(remained_sep_indexes_without_mother)): index_j = remained_sep_indexes_without_mother[j] - nodes_j = set(range(x_start[index_j], x_end[index_j])) # + 1 + nodes_j = set(range(x_start[index_j], x_end[index_j] + 1)) #print(index_j, nodes_j, "nodes_j without mother") if nodes_i - nodes_j != nodes_i: #print("type=1") From acee4c1bfe227055194050935f1868d1fb156701 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:43:41 +0200 Subject: [PATCH 13/21] `find_number_of_columns_in_document`: simplify --- src/eynollah/utils/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 3a383e9..f948de2 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1551,23 +1551,23 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, (x_max_head>=.84*region_pre_p.shape[1])] cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) - cy_seps_splitters = np.sort(cy_seps_splitters) + cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] - splitter_y_new_diff = np.diff(splitter_y_new) / float(region_pre_p.shape[0]) * 100 - - args_big_parts=np.arange(len(splitter_y_new_diff))[ splitter_y_new_diff>22 ] + big_part = 22 * region_pre_p.shape[0] // 100 # percent height regions_without_separators=return_regions_without_separators(region_pre_p) - length_y_threshold=regions_without_separators.shape[0]/4.0 num_col_fin=0 peaks_neg_fin_fin=[] - for itiles in args_big_parts: - regions_without_separators_tile=regions_without_separators[int(splitter_y_new[itiles]): - int(splitter_y_new[itiles+1]),:] + num_big_parts = 0 + for top, bot in pairwise(splitter_y_new): + if bot - top < big_part: + continue + num_big_parts += 1 try: - num_col, peaks_neg_fin = find_num_col(regions_without_separators_tile, + num_col, peaks_neg_fin = find_num_col(regions_without_separators[top: bot], num_col_classifier, tables, multiplier=7.0) + #print("big part %d:%d has %d columns" % (top, bot, num_col), peaks_neg_fin) except: num_col = 0 peaks_neg_fin = [] @@ -1575,7 +1575,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, num_col_fin=num_col peaks_neg_fin_fin=peaks_neg_fin - if 
len(args_big_parts)==1 and (len(peaks_neg_fin_fin)+1)=500] peaks_neg_fin=peaks_neg_fin[peaks_neg_fin<=(vertical.shape[1]-500)] From 5d15941b350841a4490e002c92ff89a5f6113905 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:51:59 +0200 Subject: [PATCH 14/21] `contours_in_same_horizon`: simplify - array instead of list operations - return array of index pairs instead of list objects --- src/eynollah/utils/__init__.py | 77 ++++++++++++++++------------------ src/eynollah/utils/contour.py | 25 +++++------ 2 files changed, 46 insertions(+), 56 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index f948de2..10987ad 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1315,47 +1315,42 @@ def combine_hor_lines_and_delete_cross_points_and_get_lines_features_back_new( float(num_col_classifier)) if len_lines_bigger_than_x_width_smaller_than_acolumn_width_per_column < 10: args_hor=np.arange(len(slope_lines_hor)) - all_args_uniq=contours_in_same_horizon(cy_main_hor) - #print(all_args_uniq,'all_args_uniq') - if len(all_args_uniq)>0: - if type(all_args_uniq[0]) is list: - special_separators=[] - contours_new=[] - for dd in range(len(all_args_uniq)): - merged_all=None - some_args=args_hor[all_args_uniq[dd]] - some_cy=cy_main_hor[all_args_uniq[dd]] - some_x_min=x_min_main_hor[all_args_uniq[dd]] - some_x_max=x_max_main_hor[all_args_uniq[dd]] - - #img_in=np.zeros(separators_closeup_n[:,:,2].shape) - #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') - diff_x_some=some_x_max-some_x_min - for jv in range(len(some_args)): - img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) - if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): - img_p_in[int(np.mean(some_cy))-5: - int(np.mean(some_cy))+5, - int(np.min(some_x_min)): - int(np.max(some_x_max)) ]=1 - sum_dis=dist_x_hor[some_args].sum() - diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) - - if (diff_max_min_uniques > sum_dis and - sum_dis / float(diff_max_min_uniques) > 0.85 and - diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and - np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): - # print(dist_x_hor[some_args], - # dist_x_hor[some_args].sum(), - # np.min(x_min_main_hor[some_args]), - # np.max(x_max_main_hor[some_args]),'jalibdi') - # print(np.mean( dist_x_hor[some_args] ), - # np.std( dist_x_hor[some_args] ), - # np.var( dist_x_hor[some_args] ),'jalibdiha') - special_separators.append(np.mean(cy_main_hor[some_args])) - else: - img_p_in=img_in_hor - special_separators=[] + sep_pairs=contours_in_same_horizon(cy_main_hor) + if len(sep_pairs): + special_separators=[] + contours_new=[] + for pair in sep_pairs: + merged_all=None + some_args=args_hor[pair] + some_cy=cy_main_hor[pair] + some_x_min=x_min_main_hor[pair] + some_x_max=x_max_main_hor[pair] + + #img_in=np.zeros(separators_closeup_n[:,:,2].shape) + #print(img_p_in_ver.shape[1],some_x_max-some_x_min,'xdiff') + diff_x_some=some_x_max-some_x_min + for jv in range(len(some_args)): + img_p_in=cv2.fillPoly(img_in_hor, pts=[contours_lines_hor[some_args[jv]]], color=(1,1,1)) + if any(i_diff>(img_p_in_ver.shape[1]/float(3.3)) for i_diff in diff_x_some): + img_p_in[int(np.mean(some_cy))-5: + int(np.mean(some_cy))+5, + int(np.min(some_x_min)): + int(np.max(some_x_max)) ]=1 + sum_dis=dist_x_hor[some_args].sum() + 
diff_max_min_uniques=np.max(x_max_main_hor[some_args])-np.min(x_min_main_hor[some_args]) + + if (diff_max_min_uniques > sum_dis and + sum_dis / float(diff_max_min_uniques) > 0.85 and + diff_max_min_uniques / float(img_p_in_ver.shape[1]) > 0.85 and + np.std(dist_x_hor[some_args]) < 0.55 * np.mean(dist_x_hor[some_args])): + # print(dist_x_hor[some_args], + # dist_x_hor[some_args].sum(), + # np.min(x_min_main_hor[some_args]), + # np.max(x_max_main_hor[some_args]),'jalibdi') + # print(np.mean( dist_x_hor[some_args] ), + # np.std( dist_x_hor[some_args] ), + # np.var( dist_x_hor[some_args] ),'jalibdiha') + special_separators.append(np.mean(cy_main_hor[some_args])) else: img_p_in=img_in_hor special_separators=[] diff --git a/src/eynollah/utils/contour.py b/src/eynollah/utils/contour.py index f304db2..052688c 100644 --- a/src/eynollah/utils/contour.py +++ b/src/eynollah/utils/contour.py @@ -14,21 +14,16 @@ from .rotate import rotate_image, rotation_image_new def contours_in_same_horizon(cy_main_hor): - X1 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - X2 = np.zeros((len(cy_main_hor), len(cy_main_hor))) - - X1[0::1, :] = cy_main_hor[:] - X2 = X1.T - - X_dif = np.abs(X2 - X1) - args_help = np.array(range(len(cy_main_hor))) - all_args = [] - for i in range(len(cy_main_hor)): - list_h = list(args_help[X_dif[i, :] <= 20]) - list_h.append(i) - if len(list_h) > 1: - all_args.append(list(set(list_h))) - return np.unique(np.array(all_args, dtype=object)) + """ + Takes an array of y coords, identifies all pairs among them + which are close to each other, and returns all such pairs + by index into the array. + """ + sort = np.argsort(cy_main_hor) + same = np.diff(cy_main_hor[sort] <= 20) + # groups = np.split(sort, np.arange(len(cy_main_hor) - 1)[~same] + 1) + same = np.flatnonzero(same) + return np.stack((sort[:-1][same], sort[1:][same])).T def find_contours_mean_y_diff(contours_main): M_main = [cv2.moments(contours_main[j]) for j in range(len(contours_main))] From 6cc5900943d5395adbbbea737871413bf10b9ccf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 01:55:07 +0200 Subject: [PATCH 15/21] `find_num_col`: add better plotting (but commented out) --- src/eynollah/utils/__init__.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 10987ad..4046396 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -485,9 +485,12 @@ def find_num_col(regions_without_separators, num_col_classifier, tables, multipl # print(peaks_neg,'peaks_neg') # fig, (ax1, ax2) = plt.subplots(2, sharex=True) # ax1.imshow(regions_without_separators, aspect="auto") - # ax2.plot(z) - # ax2.scatter(peaks_neg, z[peaks_neg]) - # ax2.axhline(grenze, label="grenze") + # ax2.plot(z, color='red', label='z') + # ax2.plot(zneg[20:], color='blue', label='zneg') + # ax2.scatter(peaks_neg, z[peaks_neg], color='red') + # ax2.scatter(peaks_neg, zneg[20:][peaks_neg], color='blue') + # ax2.axhline(min_peaks_pos, color='red', label="min_peaks_pos") + # ax2.axhline(grenze, color='blue', label="grenze") # ax2.text(0, grenze, "grenze") # plt.show() @@ -816,6 +819,12 @@ def find_num_col_by_vertical_lines(regions_without_separators, multiplier=3.8): peaks, _ = find_peaks(z, height=0) # print(peaks,'peaksnew') + # fig, (ax1, ax2) = plt.subplots(2, sharex=True, suptitle='find_num_col_by_vertical_lines') + # ax1.imshow(regions_without_separators, aspect="auto") + # ax2.plot(z) + # ax2.scatter(peaks, z[peaks]) + # 
ax2.set_title('find_peaks(regions_without_separators.sum(axis=0), height=0)') + # plt.show() return peaks def return_regions_without_separators(regions_pre): From 6fbb5f8a12185192f7d9db7b008c3ef8b5f24d33 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:02:39 +0200 Subject: [PATCH 16/21] `return_boxes_of_images_by_order_of_reading_new`: simplify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - array instead of list operations - add better plotting (but commented out) - add more debug printing (but commented out) - add more inline comments for documentation - rename identifiers to make more readable: - `cy_hor_diff` → `y_max_hor_some` (because the ymax gets passed) - `lines` → `seps` - `y_type_2` → `y_mid` - `y_diff_type_2` → `y_max` - `y_lines_by_order` → `y_mid_by_order` - `y_lines_without_mother` → `y_mid_without_mother` - `y_lines_with_child_without_mother` → `y_mid_with_child_without_mother` - `y_column` → `y_mid_column` - `y_column_nc` → `y_mid_column_nc` - `y_all_between_nm_wc` → `y_mid_between_nm_wc` - `lines_so_close_to_top_separator` → `seps_too_close_to_top_separator` - `y_in_cols` and `y_down` → `y_mid_next` - use `pairwise()` `nc_top:nc_bot` instead of `i_c` indexing --- src/eynollah/utils/__init__.py | 486 +++++++++++++++++---------------- 1 file changed, 250 insertions(+), 236 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 4046396..eca96f3 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1599,19 +1599,31 @@ def return_boxes_of_images_by_order_of_reading_new( if logger is None: logger = getLogger(__package__) logger.debug('enter return_boxes_of_images_by_order_of_reading_new') - # def dbg_plt(box=None, title=None): - # if box is None: - # box = [None, None, None, None] - # img = regions_without_separators[box[2]:box[3], box[0]:box[1]] + + # def dbg_plt(box=None, title=None, rectangles=None, rectangles_showidx=False): + # minx, maxx, miny, maxy = box or (0, None, 0, None) + # img = regions_without_separators[miny:maxy, minx:maxx] # plt.imshow(img) # xrange = np.arange(0, img.shape[1], 100) # yrange = np.arange(0, img.shape[0], 100) - # plt.gca().set_xticks(xrange, xrange + (box[0] or 0)) - # plt.gca().set_yticks(yrange, yrange + (box[2] or 0)) + # ax = plt.gca() + # ax.set_xticks(xrange) + # ax.set_yticks(yrange) + # ax.set_xticklabels(xrange + minx) + # ax.set_yticklabels(yrange + miny) + # def format_coord(x, y): + # return 'x={:g}, y={:g}'.format(x + minx, y + miny) + # ax.format_coord = format_coord # if title: # plt.title(title) + # if rectangles: + # for i, (xmin, xmax, ymin, ymax) in enumerate(rectangles): + # ax.add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # if rectangles_showidx: + # ax.text((xmin+xmax)/2, (ymin+ymax)/2, str(i + 1), c='r') # plt.show() - # dbg_plt() + # dbg_plt(title="return_boxes_of_images_by_order_of_reading_new") boxes=[] peaks_neg_tot_tables = [] @@ -1619,9 +1631,7 @@ def return_boxes_of_images_by_order_of_reading_new( width_tot = regions_without_separators.shape[1] for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') - # dbg_plt([None, None, top, bot], - # "image cut for y split %d:%d" % ( - # top, bot)) + # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) matrix_new = matrix_of_lines_ch[(matrix_of_lines_ch[:,6] > top) & (matrix_of_lines_ch[:,7] < bot)] #print(len( 
matrix_new[:,9][matrix_new[:,9]==1] )) @@ -1677,20 +1687,21 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_fin = peaks_neg_fin1 else: peaks_neg_fin = peaks_neg_fin2 + # add offset to local result peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin_rev.extend(peaks_neg_fin) if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary peaks_neg_fin_rev.append(right) - peaks_neg_fin_rev.extend(peaks_neg_fin) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - if len(peaks_neg_fin_rev)>=len(peaks_neg_fin_org): - peaks_neg_fin=list(np.sort(peaks_neg_fin_rev)) - num_col=len(peaks_neg_fin) + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev else: - peaks_neg_fin=list(np.copy(peaks_neg_fin_org)) - num_col=len(peaks_neg_fin) - + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) #print(peaks_neg_fin,'peaks_neg_fin') except: logger.exception("cannot find peaks consistent with columns") @@ -1700,7 +1711,7 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - cy_hor_diff=matrix_new[:,7][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] if right2left_readingorder: x_max_hor_some_new = width_tot - x_min_hor_some @@ -1708,136 +1719,121 @@ def return_boxes_of_images_by_order_of_reading_new( x_min_hor_some =list(np.copy(x_min_hor_some_new)) x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = [0] + peaks_neg_fin + [width_tot] + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') peaks_neg_tot_tables.append(peaks_neg_tot) - reading_order_type, x_starting, x_ending, y_type_2, y_diff_type_2, \ - y_lines_without_mother, x_start_without_mother, x_end_without_mother, there_is_sep_with_child, \ - y_lines_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + all_columns = set(range(len(peaks_neg_tot) - 1)) + #print("all_columns", all_columns) + + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, cy_hor_diff) + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) - all_columns = set(range(len(peaks_neg_tot) - 1)) - # print("all_columns", all_columns) if (reading_order_type == 1 or - len(y_lines_without_mother) >= 2 or + len(y_mid_without_mother) >= 2 or there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with try: y_grenze = top + 300 - #check if there is a big separator in this y_mains_sep_ohne_grenzen + up = (y_mid > top) & (y_mid <= 
y_grenze) - args_early_ys=np.arange(len(y_type_2)) + args_early_ys=np.arange(len(y_mid)) #print(args_early_ys,'args_early_ys') - #print(top, bot) - - x_starting_up = x_starting[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_ending_up = x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_type_2_up = y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up = y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up = args_early_ys[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - if len(y_type_2_up) > 0: - y_main_separator_up = y_type_2_up [(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - y_diff_main_separator_up = y_diff_type_2_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - args_main_to_deleted = args_up[(x_starting_up==0) & - (x_ending_up==(len(peaks_neg_tot)-1) )] - #print(y_main_separator_up,y_diff_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_diff_main_separator_up) > 0: + #print(y_mid,'y_mid') + + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? + main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, y_diff_main_separator_up.max()]) - # dbg_plt(boxes[-1], "first box") - top = y_diff_main_separator_up.max() + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() #print(top,'top') - y_type_2 = y_type_2[args_to_be_kept] + y_mid = y_mid[args_to_be_kept] x_starting = x_starting[args_to_be_kept] x_ending = x_ending[args_to_be_kept] - y_diff_type_2 = y_diff_type_2[args_to_be_kept] + y_max = y_max[args_to_be_kept] #print('galdiha') y_grenze = top + 200 - args_early_ys2=np.arange(len(y_type_2)) - y_type_2_up=y_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_starting_up=x_starting[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - x_ending_up=x_ending[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - y_diff_type_2_up=y_diff_type_2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - args_up2=args_early_ys2[(y_type_2 > top) & - (y_type_2 <= y_grenze)] - #print(y_type_2_up,x_starting_up,x_ending_up,'didid') - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2=np.array(list( set(args_early_ys2)-set(args_up2) )) - - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') - elif 
len(y_diff_main_separator_up)==0: - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in2') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1,len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - #print(args_early_ys,'args_early_ys') - #print(args_up,'args_up') - args_to_be_kept2=np.array(list( set(args_early_ys) - set(args_up) )) - - #print(args_to_be_kept2,'args_to_be_kept2') - #print(len(y_type_2),len(x_starting),len(x_ending),len(y_diff_type_2)) - if len(args_to_be_kept2)>0: - y_type_2 = y_type_2[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_diff_type_2 = y_diff_type_2[args_to_be_kept2] - else: - pass - #print('burdaydikh2') + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') + else: + args_early_ys2 = args_early_ys + args_up2 = args_up + + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) + + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] #int(top) - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if (reading_order_type == 1 or len(x_end_with_child_without_mother) == 0): if reading_order_type == 1: - y_lines_by_order.append(top) + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) x_start_by_order.append(0) x_end_by_order.append(len(peaks_neg_tot)-2) else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1845,31 +1841,32 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * 
len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - ind_args=np.arange(len(y_type_2)) - #ind_args=np.array(ind_args) + ind_args=np.arange(len(y_mid)) #print(ind_args,'ind_args') for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: @@ -1880,93 +1877,113 @@ def return_boxes_of_images_by_order_of_reading_new( range(x_start_without_mother[dj], x_end_without_mother[dj])) columns_not_covered = list(all_columns - columns_covered_by_mothers) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + len(x_start_without_mother), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, int)) x_starting = np.append(x_starting, x_start_without_mother) x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_with_child_no_mothers = set() + columns_covered_by_mothers_with_child = set() for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_with_child_no_mothers.update( + columns_covered_by_mothers_with_child.update( range(x_start_with_child_without_mother[dj], x_end_with_child_without_mother[dj])) - columns_not_covered_child_no_mother = list( - all_columns - columns_covered_by_with_child_no_mothers) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) #indexes_to_be_spanned=[] for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_child_no_mother.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_child_no_mother = np.sort(columns_not_covered_child_no_mother) - ind_args = np.arange(len(y_type_2)) - x_end_with_child_without_mother = np.array(x_end_with_child_without_mother, int) - x_start_with_child_without_mother = np.array(x_start_with_child_without_mother, int) - for i_s_nc in columns_not_covered_child_no_mother: + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + 
#print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: if i_s_nc in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") #print("i_s_nc", i_s_nc) x_end_biggest_column = \ - x_end_with_child_without_mother[x_start_with_child_without_mother==i_s_nc][0] - args_all_biggest_lines = ind_args[(x_starting==i_s_nc) & - (x_ending==x_end_biggest_column)] - y_column_nc = y_type_2[args_all_biggest_lines] - #x_start_column_nc = x_starting[args_all_biggest_lines] - #x_end_column_nc = x_ending[args_all_biggest_lines] - y_column_nc = np.sort(y_column_nc) - for i_c in range(len(y_column_nc)): + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): #print("i_c", i_c) - ind_all_lines_between_nm_wc = \ - ind_args[(y_type_2 > y_column_nc[i_c]) & - (y_type_2 < (y_column_nc[i_c+1] - if i_c < len(y_column_nc)-1 - else bot)) & + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) & (x_starting >= i_s_nc) & (x_ending <= x_end_biggest_column)] - y_all_between_nm_wc = y_type_2[ind_all_lines_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_lines_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_lines_between_nm_wc] + y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] columns_covered_by_mothers = set() - for dj in range(len(ind_all_lines_between_nm_wc)): + for dj in range(len(ind_all_seps_between_nm_wc)): columns_covered_by_mothers.update( range(x_starting_all_between_nm_wc[dj], x_ending_all_between_nm_wc[dj])) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") child_columns = set(range(i_s_nc, x_end_biggest_column)) columns_not_covered = list(child_columns - columns_covered_by_mothers) + #print(child_columns, "child_columns") + #print(columns_not_covered, "columns_not_covered") - if len(ind_all_lines_between_nm_wc): + if len(ind_all_seps_between_nm_wc): biggest = np.argmax(x_ending_all_between_nm_wc - x_starting_all_between_nm_wc) + #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") + #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest]), "biggest") if columns_covered_by_mothers == set( range(x_starting_all_between_nm_wc[biggest], x_ending_all_between_nm_wc[biggest])): - # biggest accounts for all columns alone, - # longest line should be extended - lines_so_close_to_top_separator = \ - ((y_all_between_nm_wc > y_column_nc[i_c]) & - (y_all_between_nm_wc <= y_column_nc[i_c] + 500)) - if (np.count_nonzero(lines_so_close_to_top_separator) and - np.count_nonzero(lines_so_close_to_top_separator) < - len(ind_all_lines_between_nm_wc)): - y_all_between_nm_wc = \ - 
y_all_between_nm_wc[~lines_so_close_to_top_separator] + # single biggest accounts for all covered columns alone, + # this separator should be extended to cover all + seps_too_close_to_top_separator = \ + ((y_mid_all_between_nm_wc > nc_top) & + (y_mid_all_between_nm_wc <= nc_top + 500)) + if (np.count_nonzero(seps_too_close_to_top_separator) and + np.count_nonzero(seps_too_close_to_top_separator) < + len(ind_all_seps_between_nm_wc)): + #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") + y_mid_all_between_nm_wc = \ + y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~lines_so_close_to_top_separator] + x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~lines_so_close_to_top_separator] - - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_end_biggest_column) + x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] + + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_end_biggest_column) else: - y_all_between_nm_wc = np.append(y_all_between_nm_wc, y_column_nc[i_c]) - x_starting_all_between_nm_wc = np.append(x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append(x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) if len(columns_not_covered): - y_all_between_nm_wc = np.append( - y_all_between_nm_wc, [y_column_nc[i_c]] * len(columns_not_covered)) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) x_starting_all_between_nm_wc = np.append( x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) x_ending_all_between_nm_wc = np.append( @@ -1977,52 +1994,53 @@ def return_boxes_of_images_by_order_of_reading_new( ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_all_between_nm_wc[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] #print('babali3') - ind_args_col_sorted=np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - #print(column,'column') + #print(i_s_nc,'column not covered by mothers with child') ind_args_in_col=ind_args[x_starting==i_s_nc] #print('babali2') #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] 
x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] #print('babali3') - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = y_lines_by_order[il] + y_mid_itself = y_mid_by_order[il] x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda') - y_down = y_in_cols.min(initial=bot) #print('burda2') - #print(y_in_cols,'y_in_cols') - #print(y_itself,'y_itself') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) except: logger.exception("cannot assign boxes") @@ -2030,20 +2048,21 @@ def return_boxes_of_images_by_order_of_reading_new( top, bot]) # dbg_plt(boxes[-1], "fallback box") else: - y_lines_by_order=[] + # order multi-column separators + y_mid_by_order=[] x_start_by_order=[] x_end_by_order=[] if len(x_starting)>0: - columns_covered_by_lines_covered_more_than_2col = set() + columns_covered_by_seps_covered_more_than_2col = set() for dj in range(len(x_starting)): if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_lines_covered_more_than_2col.update( + columns_covered_by_seps_covered_more_than_2col.update( range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_lines_covered_more_than_2col) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) @@ -2055,53 +2074,52 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, x_ending[0]) else: columns_not_covered = list(all_columns) - y_type_2 = np.append(y_type_2, np.ones(len(columns_not_covered), - dtype=int) * top) - ##y_lines_by_order = np.append(y_lines_by_order, [top] * len(columns_not_covered)) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order 
= np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - ind_args = np.arange(len(y_type_2)) - + ind_args = np.arange(len(y_mid)) + for column in range(len(peaks_neg_tot)-1): #print(column,'column') ind_args_in_col=ind_args[x_starting==column] - #print(len(y_type_2)) - y_column=y_type_2[ind_args_in_col] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] x_start_column=x_starting[ind_args_in_col] x_end_column=x_ending[ind_args_in_col] - ind_args_col_sorted = np.argsort(y_column) - y_lines_by_order.extend(y_column[ind_args_col_sorted]) + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) x_start_by_order.extend(x_start_column[ind_args_col_sorted]) x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - y_lines_by_order = np.array(y_lines_by_order) + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) x_start_by_order = np.array(x_start_by_order) x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_lines_by_order)): + for il in range(len(y_mid_by_order)): #print(il, "il") - y_itself = y_lines_by_order[il] - #print(y_itself,'y_itself') + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] for column in range(x_start_itself, x_end_itself+1): #print(column,'cols') - y_in_cols = y_lines_by_order[(y_itself < y_lines_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] #print('burda2') - #print(y_in_cols,'y_in_cols') - y_down = y_in_cols.min(initial=bot) - #print(y_down,'y_down') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], - y_itself, - y_down]) + y_mid_itself, + y_mid_next]) # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) - #else: - #boxes.append([ 0, regions_without_separators[:,:].shape[1] ,top, bot]) if right2left_readingorder: peaks_neg_tot_tables_new = [] @@ -2119,11 +2137,7 @@ def return_boxes_of_images_by_order_of_reading_new( peaks_neg_tot_tables = peaks_neg_tot_tables_new # show final xy-cut - # plt.imshow(regions_without_separators) - # for xmin, xmax, ymin, ymax in boxes: - # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, - # fill=False, linewidth=1, edgecolor='r')) - # plt.show() + # dbg_plt(None, "final XY-Cut", boxes, True) logger.debug('exit return_boxes_of_images_by_order_of_reading_new') return boxes, peaks_neg_tot_tables From 66a0e55e49e4224e38c9792d06d2468c7fe8fe90 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:15:13 +0200 Subject: [PATCH 17/21] `return_boxes_of_images_by_order_of_reading_new`: avoid oversplits when y slice (`top:bot`) is not a significant part of the page, viz. 
less than 22% (as in `find_number_of_columns_in_document`), avoid forcing `find_num_col` to reach `num_col_classifier` (allows large headers not to be split up and thus better ordered) --- src/eynollah/utils/__init__.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index eca96f3..2017cea 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1628,7 +1628,8 @@ def return_boxes_of_images_by_order_of_reading_new( boxes=[] peaks_neg_tot_tables = [] splitter_y_new = np.array(splitter_y_new, dtype=int) - width_tot = regions_without_separators.shape[1] + height_tot, width_tot = regions_without_separators.shape + big_part = 22 * height_tot // 100 # percent height for top, bot in pairwise(splitter_y_new): # print("%d:%d" % (top, bot), 'i') # dbg_plt([0, None, top, bot], "image cut for y split %d:%d" % (top, bot)) @@ -1644,12 +1645,17 @@ def return_boxes_of_images_by_order_of_reading_new( try: num_col, peaks_neg_fin = find_num_col( regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=6. if erosion_hurts else 7.) + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) except: peaks_neg_fin=[] num_col = 0 try: - if (len(peaks_neg_fin)+1)= big_part): # found too few columns here #print('burda') peaks_neg_fin_org = np.copy(peaks_neg_fin) From 3ebbc2d693ae14a640c3cb478b6a01cd1e42efb7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:30:39 +0200 Subject: [PATCH 18/21] `return_boxes_of_images_by_order_of_reading_new`: indent (by removing unnecessary conditional) --- src/eynollah/utils/__init__.py | 873 ++++++++++++++++----------------- 1 file changed, 436 insertions(+), 437 deletions(-) diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py index 2017cea..f30d55e 100644 --- a/src/eynollah/utils/__init__.py +++ b/src/eynollah/utils/__init__.py @@ -1641,241 +1641,204 @@ def return_boxes_of_images_by_order_of_reading_new( #if (len(matrix_new[:,9][matrix_new[:,9]==1]) > 0 and # np.max(matrix_new[:,8][matrix_new[:,9]==1]) >= # 0.1 * (np.abs(bot-top))): - if True: - try: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - # we do not expect to get all columns in small parts (headings etc.): - num_col_classifier if bot - top >= big_part else 1, - tables, multiplier=6. if erosion_hurts else 7.) - except: - peaks_neg_fin=[] - num_col = 0 - try: - if ((len(peaks_neg_fin) + 1 < num_col_classifier or - num_col_classifier == 6) and - # we do not expect to get all columns in small parts (headings etc.): - bot - top >= big_part): - # found too few columns here - #print('burda') - peaks_neg_fin_org = np.copy(peaks_neg_fin) - #print("peaks_neg_fin_org", peaks_neg_fin_org) - if len(peaks_neg_fin)==0: - num_col, peaks_neg_fin = find_num_col( - regions_without_separators[top:bot], - num_col_classifier, tables, multiplier=3.) 
- #print(peaks_neg_fin,'peaks_neg_fin') - peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] - - #print(peaks_neg_fin_early,'burda2') - peaks_neg_fin_rev=[] - for left, right in pairwise(peaks_neg_fin_early): - # print("%d:%d" % (left, right), 'i_n') - # dbg_plt([left, right, top, bot], - # "image cut for y split %d:%d / x gap %d:%d" % ( - # top, bot, left, right)) - # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) - # plt.title("vertical projection (sum over y)") - # plt.show() - try: - _, peaks_neg_fin1 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=7.) - except: - peaks_neg_fin1 = [] - try: - _, peaks_neg_fin2 = find_num_col( - regions_without_separators[top:bot, left:right], - num_col_classifier, tables, multiplier=5.) - except: - peaks_neg_fin2 = [] - if len(peaks_neg_fin1) >= len(peaks_neg_fin2): - peaks_neg_fin = peaks_neg_fin1 - else: - peaks_neg_fin = peaks_neg_fin2 - # add offset to local result - peaks_neg_fin = list(np.array(peaks_neg_fin) + left) - #print(peaks_neg_fin,'peaks_neg_fin') - - peaks_neg_fin_rev.extend(peaks_neg_fin) - if right < peaks_neg_fin_early[-1]: - # all but the last column: interject the preexisting boundary - peaks_neg_fin_rev.append(right) - #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') - - if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): - peaks_neg_fin = peaks_neg_fin_rev + try: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + # we do not expect to get all columns in small parts (headings etc.): + num_col_classifier if bot - top >= big_part else 1, + tables, multiplier=6. if erosion_hurts else 7.) + except: + peaks_neg_fin=[] + num_col = 0 + try: + if ((len(peaks_neg_fin) + 1 < num_col_classifier or + num_col_classifier == 6) and + # we do not expect to get all columns in small parts (headings etc.): + bot - top >= big_part): + # found too few columns here + #print('burda') + peaks_neg_fin_org = np.copy(peaks_neg_fin) + #print("peaks_neg_fin_org", peaks_neg_fin_org) + if len(peaks_neg_fin)==0: + num_col, peaks_neg_fin = find_num_col( + regions_without_separators[top:bot], + num_col_classifier, tables, multiplier=3.) + #print(peaks_neg_fin,'peaks_neg_fin') + peaks_neg_fin_early = [0] + peaks_neg_fin + [width_tot-1] + + #print(peaks_neg_fin_early,'burda2') + peaks_neg_fin_rev=[] + for left, right in pairwise(peaks_neg_fin_early): + # print("%d:%d" % (left, right), 'i_n') + # dbg_plt([left, right, top, bot], + # "image cut for y split %d:%d / x gap %d:%d" % ( + # top, bot, left, right)) + # plt.plot(regions_without_separators[top:bot, left:right].sum(axis=0)) + # plt.title("vertical projection (sum over y)") + # plt.show() + try: + _, peaks_neg_fin1 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=7.) + except: + peaks_neg_fin1 = [] + try: + _, peaks_neg_fin2 = find_num_col( + regions_without_separators[top:bot, left:right], + num_col_classifier, tables, multiplier=5.) 
+ except: + peaks_neg_fin2 = [] + if len(peaks_neg_fin1) >= len(peaks_neg_fin2): + peaks_neg_fin = peaks_neg_fin1 else: - peaks_neg_fin = peaks_neg_fin_org - num_col = len(peaks_neg_fin) + peaks_neg_fin = peaks_neg_fin2 + # add offset to local result + peaks_neg_fin = list(np.array(peaks_neg_fin) + left) #print(peaks_neg_fin,'peaks_neg_fin') - except: - logger.exception("cannot find peaks consistent with columns") - #num_col, peaks_neg_fin = find_num_col( - # regions_without_separators[top:bot,:], - # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) - - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) - #print(peaks_neg_tot,'peaks_neg_tot') - peaks_neg_tot_tables.append(peaks_neg_tot) - - all_columns = set(range(len(peaks_neg_tot) - 1)) - #print("all_columns", all_columns) - - reading_order_type, x_starting, x_ending, y_mid, y_max, \ - y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ - there_is_sep_with_child, \ - y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ - new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( - x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) - - # show multi-column separators - # dbg_plt([0, None, top, bot], "multi-column separators in current split", - # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], - # y_mid - top, y_max - top)), True) - - if (reading_order_type == 1 or - len(y_mid_without_mother) >= 2 or - there_is_sep_with_child == 1): - # there are top-level multi-colspan horizontal separators which overlap each other - # or multiple top-level multi-colspan horizontal separators - # or multi-colspan horizontal separators shorter than their respective top-level: - # todo: explain how this is dealt with - try: - y_grenze = top + 300 - up = (y_mid > top) & (y_mid <= y_grenze) - - args_early_ys=np.arange(len(y_mid)) - #print(args_early_ys,'args_early_ys') - #print(y_mid,'y_mid') - - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up = args_early_ys[up] - #print(args_up,'args_up') - #print(y_mid_up,'y_mid_up') - #check if there is a big separator in this y_mains0 - if len(y_mid_up) > 0: - # is there a separator with full-width span? 
- main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) - y_mid_main_separator_up = y_mid_up[main_separator] - y_max_main_separator_up = y_max_up[main_separator] - args_main_to_deleted = args_up[main_separator] - #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') - if len(y_max_main_separator_up): - args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) - #print(args_to_be_kept,'args_to_be_kept') - boxes.append([0, peaks_neg_tot[-1], - top, y_max_main_separator_up.max()]) - # dbg_plt(boxes[-1], "near top main separator box") - top = y_max_main_separator_up.max() - - #print(top,'top') - y_mid = y_mid[args_to_be_kept] - x_starting = x_starting[args_to_be_kept] - x_ending = x_ending[args_to_be_kept] - y_max = y_max[args_to_be_kept] - - #print('galdiha') - y_grenze = top + 200 - up = (y_mid > top) & (y_mid <= y_grenze) - args_early_ys2 = np.arange(len(y_mid)) - x_starting_up = x_starting[up] - x_ending_up = x_ending[up] - y_mid_up = y_mid[up] - y_max_up = y_max[up] - args_up2 = args_early_ys2[up] - #print(y_mid_up,x_starting_up,x_ending_up,'didid') - else: - args_early_ys2 = args_early_ys - args_up2 = args_up - - nodes_in = set() - for ij in range(len(x_starting_up)): - nodes_in.update(range(x_starting_up[ij], - x_ending_up[ij])) - #print(nodes_in,'nodes_in') - #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') - - if nodes_in == set(range(len(peaks_neg_tot)-1)): - pass - elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): - pass - else: - #print('burdaydikh') - args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) - - if len(args_to_be_kept2): - #print(args_to_be_kept2, "args_to_be_kept2") - y_mid = y_mid[args_to_be_kept2] - x_starting = x_starting[args_to_be_kept2] - x_ending = x_ending[args_to_be_kept2] - y_max = y_max[args_to_be_kept2] - - #int(top) - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if (reading_order_type == 1 or - len(x_end_with_child_without_mother) == 0): - if reading_order_type == 1: - # there are top-level multi-colspan horizontal separators which overlap each other - #print("adding all columns at top because of multiple overlapping mothers") - y_mid_by_order.append(top) - x_start_by_order.append(0) - x_end_by_order.append(len(peaks_neg_tot)-2) - else: - # there are no top-level multi-colspan horizontal separators which themselves - # contain shorter multi-colspan separators - #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') - columns_covered_by_mothers = set() - for dj in range(len(x_start_without_mother)): - columns_covered_by_mothers.update( - range(x_start_without_mother[dj], - x_end_without_mother[dj])) - columns_not_covered = list(all_columns - columns_covered_by_mothers) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - #print(columns_not_covered, "columns_not_covered") - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + - len(x_start_without_mother), - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, int)) - x_starting = np.append(x_starting, x_start_without_mother) - x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) - x_ending = np.append(x_ending, x_end_without_mother) - - 
ind_args=np.arange(len(y_mid)) - #print(ind_args,'ind_args') - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + peaks_neg_fin_rev.extend(peaks_neg_fin) + if right < peaks_neg_fin_early[-1]: + # all but the last column: interject the preexisting boundary + peaks_neg_fin_rev.append(right) + #print(peaks_neg_fin_rev,'peaks_neg_fin_rev') + + if len(peaks_neg_fin_rev) >= len(peaks_neg_fin_org): + peaks_neg_fin = peaks_neg_fin_rev + else: + peaks_neg_fin = peaks_neg_fin_org + num_col = len(peaks_neg_fin) + #print(peaks_neg_fin,'peaks_neg_fin') + except: + logger.exception("cannot find peaks consistent with columns") + #num_col, peaks_neg_fin = find_num_col( + # regions_without_separators[top:bot,:], + # multiplier=7.0) + x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] + x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] + cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] + y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] + + if right2left_readingorder: + x_max_hor_some_new = width_tot - x_min_hor_some + x_min_hor_some_new = width_tot - x_max_hor_some + x_min_hor_some =list(np.copy(x_min_hor_some_new)) + x_max_hor_some =list(np.copy(x_max_hor_some_new)) + + peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) + #print(peaks_neg_tot,'peaks_neg_tot') + peaks_neg_tot_tables.append(peaks_neg_tot) + + all_columns = set(range(len(peaks_neg_tot) - 1)) + #print("all_columns", all_columns) + + reading_order_type, x_starting, x_ending, y_mid, y_max, \ + y_mid_without_mother, x_start_without_mother, x_end_without_mother, \ + there_is_sep_with_child, \ + y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \ + new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order( + x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some) + + # show multi-column separators + # dbg_plt([0, None, top, bot], "multi-column separators in current split", + # list(zip(peaks_neg_tot[x_starting], peaks_neg_tot[x_ending], + # y_mid - top, y_max - top)), True) + + if (reading_order_type == 1 or + len(y_mid_without_mother) >= 2 or + there_is_sep_with_child == 1): + # there are top-level multi-colspan horizontal separators which overlap each other + # or multiple top-level multi-colspan horizontal separators + # or multi-colspan horizontal separators shorter than their respective top-level: + # todo: explain how this is dealt with + try: + y_grenze = top + 300 + up = (y_mid > top) & (y_mid <= y_grenze) + + args_early_ys=np.arange(len(y_mid)) + #print(args_early_ys,'args_early_ys') + #print(y_mid,'y_mid') + + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up = args_early_ys[up] + #print(args_up,'args_up') + #print(y_mid_up,'y_mid_up') + #check if there is a big separator in this y_mains0 + if len(y_mid_up) > 0: + # is there a separator with full-width span? 
+ main_separator = (x_starting_up == 0) & (x_ending_up == len(peaks_neg_tot) - 1) + y_mid_main_separator_up = y_mid_up[main_separator] + y_max_main_separator_up = y_max_up[main_separator] + args_main_to_deleted = args_up[main_separator] + #print(y_mid_main_separator_up,y_max_main_separator_up,args_main_to_deleted,'fffffjammmm') + if len(y_max_main_separator_up): + args_to_be_kept = np.array(list( set(args_early_ys) - set(args_main_to_deleted) )) + #print(args_to_be_kept,'args_to_be_kept') + boxes.append([0, peaks_neg_tot[-1], + top, y_max_main_separator_up.max()]) + # dbg_plt(boxes[-1], "near top main separator box") + top = y_max_main_separator_up.max() + + #print(top,'top') + y_mid = y_mid[args_to_be_kept] + x_starting = x_starting[args_to_be_kept] + x_ending = x_ending[args_to_be_kept] + y_max = y_max[args_to_be_kept] + + #print('galdiha') + y_grenze = top + 200 + up = (y_mid > top) & (y_mid <= y_grenze) + args_early_ys2 = np.arange(len(y_mid)) + x_starting_up = x_starting[up] + x_ending_up = x_ending[up] + y_mid_up = y_mid[up] + y_max_up = y_max[up] + args_up2 = args_early_ys2[up] + #print(y_mid_up,x_starting_up,x_ending_up,'didid') + else: + args_early_ys2 = args_early_ys + args_up2 = args_up + + nodes_in = set() + for ij in range(len(x_starting_up)): + nodes_in.update(range(x_starting_up[ij], + x_ending_up[ij])) + #print(nodes_in,'nodes_in') + #print(np.array(range(len(peaks_neg_tot)-1)),'np.array(range(len(peaks_neg_tot)-1))') + + if nodes_in == set(range(len(peaks_neg_tot)-1)): + pass + elif nodes_in == set(range(1, len(peaks_neg_tot)-1)): + pass + else: + #print('burdaydikh') + args_to_be_kept2 = np.array(list( set(args_early_ys2) - set(args_up2) )) + + if len(args_to_be_kept2): + #print(args_to_be_kept2, "args_to_be_kept2") + y_mid = y_mid[args_to_be_kept2] + x_starting = x_starting[args_to_be_kept2] + x_ending = x_ending[args_to_be_kept2] + y_max = y_max[args_to_be_kept2] + + #int(top) + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if (reading_order_type == 1 or + len(x_end_with_child_without_mother) == 0): + if reading_order_type == 1: + # there are top-level multi-colspan horizontal separators which overlap each other + #print("adding all columns at top because of multiple overlapping mothers") + y_mid_by_order.append(top) + x_start_by_order.append(0) + x_end_by_order.append(len(peaks_neg_tot)-2) else: + # there are no top-level multi-colspan horizontal separators which themselves + # contain shorter multi-colspan separators #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') columns_covered_by_mothers = set() for dj in range(len(x_start_without_mother)): @@ -1895,212 +1858,170 @@ def return_boxes_of_images_by_order_of_reading_new( x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) x_ending = np.append(x_ending, x_end_without_mother) - columns_covered_by_mothers_with_child = set() - for dj in range(len(x_end_with_child_without_mother)): - columns_covered_by_mothers_with_child.update( - range(x_start_with_child_without_mother[dj], - x_end_with_child_without_mother[dj])) - #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") - columns_not_covered_by_mothers_with_child = list( - all_columns - columns_covered_by_mothers_with_child) - #indexes_to_be_spanned=[] - for i_s in range(len(x_end_with_child_without_mother)): - columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) - columns_not_covered_by_mothers_with_child = 
np.sort(columns_not_covered_by_mothers_with_child) - #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") - ind_args = np.arange(len(y_mid)) - for i_s_nc in columns_not_covered_by_mothers_with_child: - if i_s_nc in x_start_with_child_without_mother: - # use only seps with mother's span ("biggest") - #print("i_s_nc", i_s_nc) - x_end_biggest_column = \ - x_end_with_child_without_mother[ - x_start_with_child_without_mother == i_s_nc][0] - args_all_biggest_seps = \ - ind_args[(x_starting == i_s_nc) & - (x_ending == x_end_biggest_column)] - y_mid_column_nc = y_mid[args_all_biggest_seps] - #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") - #x_start_column_nc = x_starting[args_all_biggest_seps] - #x_end_column_nc = x_ending[args_all_biggest_seps] - y_mid_column_nc = np.sort(y_mid_column_nc) - #print(y_mid_column_nc, "y_mid_column_nc (sorted)") - for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): - #print("i_c", i_c) - #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") - ind_all_seps_between_nm_wc = \ - ind_args[(y_mid > nc_top) & - (y_mid < nc_bot) & - (x_starting >= i_s_nc) & - (x_ending <= x_end_biggest_column)] - y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] - x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] - x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] - - columns_covered_by_mothers = set() - for dj in range(len(ind_all_seps_between_nm_wc)): - columns_covered_by_mothers.update( - range(x_starting_all_between_nm_wc[dj], - x_ending_all_between_nm_wc[dj])) - #print(columns_covered_by_mothers, "columns_covered_by_mothers") - child_columns = set(range(i_s_nc, x_end_biggest_column)) - columns_not_covered = list(child_columns - columns_covered_by_mothers) - #print(child_columns, "child_columns") - #print(columns_not_covered, "columns_not_covered") - - if len(ind_all_seps_between_nm_wc): - biggest = np.argmax(x_ending_all_between_nm_wc - - x_starting_all_between_nm_wc) - #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") - #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest]), "biggest") - if columns_covered_by_mothers == set( - range(x_starting_all_between_nm_wc[biggest], - x_ending_all_between_nm_wc[biggest])): - # single biggest accounts for all covered columns alone, - # this separator should be extended to cover all - seps_too_close_to_top_separator = \ - ((y_mid_all_between_nm_wc > nc_top) & - (y_mid_all_between_nm_wc <= nc_top + 500)) - if (np.count_nonzero(seps_too_close_to_top_separator) and - np.count_nonzero(seps_too_close_to_top_separator) < - len(ind_all_seps_between_nm_wc)): - #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") - y_mid_all_between_nm_wc = \ - y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] - x_starting_all_between_nm_wc = \ - x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] - x_ending_all_between_nm_wc = \ - x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, i_s_nc) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_end_biggest_column) - else: - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, nc_top) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, 
x_starting_all_between_nm_wc[biggest]) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) - - if len(columns_not_covered): - y_mid_all_between_nm_wc = np.append( - y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) - x_starting_all_between_nm_wc = np.append( - x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) - x_ending_all_between_nm_wc = np.append( - x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) - - ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) - for column in range(int(i_s_nc), int(x_end_biggest_column)): - ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] - x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] - x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] - #print('babali3') - ind_args_col_sorted=np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - else: - #print(i_s_nc,'column not covered by mothers with child') - ind_args_in_col=ind_args[x_starting==i_s_nc] - #print('babali2') - #print(ind_args_in_col,'ind_args_in_col') - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] - #print('babali3') - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) - - # create single-column boxes from multi-column separators - y_mid_by_order = np.array(y_mid_by_order) - x_start_by_order = np.array(x_start_by_order) - x_end_by_order = np.array(x_end_by_order) - for il in range(len(y_mid_by_order)): - #print(il, "il") - y_mid_itself = y_mid_by_order[il] - x_start_itself = x_start_by_order[il] - x_end_itself = x_end_by_order[il] - for column in range(int(x_start_itself), int(x_end_itself)+1): - #print(column,'cols') - #print('burda') - #print('burda2') - y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & - (column >= x_start_by_order) & - (column <= x_end_by_order)] - y_mid_next = y_mid_next.min(initial=bot) - #print(y_mid_next,'y_mid_next') - #print(y_mid_itself,'y_mid_itself') - boxes.append([peaks_neg_tot[column], - peaks_neg_tot[column+1], - y_mid_itself, - y_mid_next]) - # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) - except: - logger.exception("cannot assign boxes") - boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], - top, bot]) - # dbg_plt(boxes[-1], "fallback box") - else: - # order multi-column separators - y_mid_by_order=[] - x_start_by_order=[] - x_end_by_order=[] - if len(x_starting)>0: - columns_covered_by_seps_covered_more_than_2col = set() - for dj in range(len(x_starting)): - if set(range(x_starting[dj], x_ending[dj])) != all_columns: - columns_covered_by_seps_covered_more_than_2col.update( - range(x_starting[dj], x_ending[dj])) - columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) - - y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, - dtype=int) * top) - ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) - ##x_start_by_order = np.append(x_start_by_order, [0] * 
len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - if len(new_main_sep_y) > 0: - x_starting = np.append(x_starting, 0) - x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) - else: - x_starting = np.append(x_starting, x_starting[0]) - x_ending = np.append(x_ending, x_ending[0]) + ind_args=np.arange(len(y_mid)) + #print(ind_args,'ind_args') + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) else: - columns_not_covered = list(all_columns) - y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + #print(x_start_without_mother,x_end_without_mother,peaks_neg_tot,'dodo') + columns_covered_by_mothers = set() + for dj in range(len(x_start_without_mother)): + columns_covered_by_mothers.update( + range(x_start_without_mother[dj], + x_end_without_mother[dj])) + columns_not_covered = list(all_columns - columns_covered_by_mothers) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + #print(columns_not_covered, "columns_not_covered") + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + + len(x_start_without_mother), dtype=int) * top) ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) - x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) - x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) - - ind_args = np.arange(len(y_mid)) - - for column in range(len(peaks_neg_tot)-1): - #print(column,'column') - ind_args_in_col=ind_args[x_starting==column] - #print(len(y_mid)) - y_mid_column=y_mid[ind_args_in_col] - x_start_column=x_starting[ind_args_in_col] - x_end_column=x_ending[ind_args_in_col] + x_starting = np.append(x_starting, np.array(columns_not_covered, int)) + x_starting = np.append(x_starting, x_start_without_mother) + x_ending = np.append(x_ending, np.array(columns_not_covered, int) + 1) + x_ending = np.append(x_ending, x_end_without_mother) + + columns_covered_by_mothers_with_child = set() + for dj in range(len(x_end_with_child_without_mother)): + columns_covered_by_mothers_with_child.update( + range(x_start_with_child_without_mother[dj], + x_end_with_child_without_mother[dj])) + #print(columns_covered_by_mothers_with_child, "columns_covered_by_mothers_with_child") + columns_not_covered_by_mothers_with_child = list( + all_columns - columns_covered_by_mothers_with_child) + #indexes_to_be_spanned=[] + for i_s in range(len(x_end_with_child_without_mother)): + columns_not_covered_by_mothers_with_child.append(x_start_with_child_without_mother[i_s]) + columns_not_covered_by_mothers_with_child = np.sort(columns_not_covered_by_mothers_with_child) + #print(columns_not_covered_by_mothers_with_child, "columns_not_covered_by_mothers_with_child") + ind_args = np.arange(len(y_mid)) + for i_s_nc in columns_not_covered_by_mothers_with_child: + if i_s_nc 
in x_start_with_child_without_mother: + # use only seps with mother's span ("biggest") + #print("i_s_nc", i_s_nc) + x_end_biggest_column = \ + x_end_with_child_without_mother[ + x_start_with_child_without_mother == i_s_nc][0] + args_all_biggest_seps = \ + ind_args[(x_starting == i_s_nc) & + (x_ending == x_end_biggest_column)] + y_mid_column_nc = y_mid[args_all_biggest_seps] + #print("%d:%d" % (i_s_nc, x_end_biggest_column), "columns covered by mother with child") + #x_start_column_nc = x_starting[args_all_biggest_seps] + #x_end_column_nc = x_ending[args_all_biggest_seps] + y_mid_column_nc = np.sort(y_mid_column_nc) + #print(y_mid_column_nc, "y_mid_column_nc (sorted)") + for nc_top, nc_bot in pairwise(np.append(y_mid_column_nc, bot)): + #print("i_c", i_c) + #print("%d:%d" % (nc_top, nc_bot), "y_mid_column_nc") + ind_all_seps_between_nm_wc = \ + ind_args[(y_mid > nc_top) & + (y_mid < nc_bot) & + (x_starting >= i_s_nc) & + (x_ending <= x_end_biggest_column)] + y_mid_all_between_nm_wc = y_mid[ind_all_seps_between_nm_wc] + x_starting_all_between_nm_wc = x_starting[ind_all_seps_between_nm_wc] + x_ending_all_between_nm_wc = x_ending[ind_all_seps_between_nm_wc] + + columns_covered_by_mothers = set() + for dj in range(len(ind_all_seps_between_nm_wc)): + columns_covered_by_mothers.update( + range(x_starting_all_between_nm_wc[dj], + x_ending_all_between_nm_wc[dj])) + #print(columns_covered_by_mothers, "columns_covered_by_mothers") + child_columns = set(range(i_s_nc, x_end_biggest_column)) + columns_not_covered = list(child_columns - columns_covered_by_mothers) + #print(child_columns, "child_columns") + #print(columns_not_covered, "columns_not_covered") + + if len(ind_all_seps_between_nm_wc): + biggest = np.argmax(x_ending_all_between_nm_wc - + x_starting_all_between_nm_wc) + #print(ind_all_seps_between_nm_wc, "ind_all_seps_between_nm_wc") + #print(biggest, "%d:%d" % (x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest]), "biggest") + if columns_covered_by_mothers == set( + range(x_starting_all_between_nm_wc[biggest], + x_ending_all_between_nm_wc[biggest])): + # single biggest accounts for all covered columns alone, + # this separator should be extended to cover all + seps_too_close_to_top_separator = \ + ((y_mid_all_between_nm_wc > nc_top) & + (y_mid_all_between_nm_wc <= nc_top + 500)) + if (np.count_nonzero(seps_too_close_to_top_separator) and + np.count_nonzero(seps_too_close_to_top_separator) < + len(ind_all_seps_between_nm_wc)): + #print(seps_too_close_to_top_separator, "seps_too_close_to_top_separator") + y_mid_all_between_nm_wc = \ + y_mid_all_between_nm_wc[~seps_too_close_to_top_separator] + x_starting_all_between_nm_wc = \ + x_starting_all_between_nm_wc[~seps_too_close_to_top_separator] + x_ending_all_between_nm_wc = \ + x_ending_all_between_nm_wc[~seps_too_close_to_top_separator] - ind_args_col_sorted = np.argsort(y_mid_column) - y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) - x_start_by_order.extend(x_start_column[ind_args_col_sorted]) - x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, i_s_nc) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_end_biggest_column) + else: + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, nc_top) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, x_starting_all_between_nm_wc[biggest]) + 
x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, x_ending_all_between_nm_wc[biggest]) + + if len(columns_not_covered): + y_mid_all_between_nm_wc = np.append( + y_mid_all_between_nm_wc, [nc_top] * len(columns_not_covered)) + x_starting_all_between_nm_wc = np.append( + x_starting_all_between_nm_wc, np.array(columns_not_covered, int)) + x_ending_all_between_nm_wc = np.append( + x_ending_all_between_nm_wc, np.array(columns_not_covered, int) + 1) + + ind_args_between=np.arange(len(x_ending_all_between_nm_wc)) + for column in range(int(i_s_nc), int(x_end_biggest_column)): + ind_args_in_col=ind_args_between[x_starting_all_between_nm_wc==column] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid_all_between_nm_wc[ind_args_in_col] + x_start_column=x_starting_all_between_nm_wc[ind_args_in_col] + x_end_column=x_ending_all_between_nm_wc[ind_args_in_col] + #print('babali3') + ind_args_col_sorted=np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + else: + #print(i_s_nc,'column not covered by mothers with child') + ind_args_in_col=ind_args[x_starting==i_s_nc] + #print('babali2') + #print(ind_args_in_col,'ind_args_in_col') + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + #print('babali3') + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) # create single-column boxes from multi-column separators y_mid_by_order = np.array(y_mid_by_order) @@ -2109,23 +2030,101 @@ def return_boxes_of_images_by_order_of_reading_new( for il in range(len(y_mid_by_order)): #print(il, "il") y_mid_itself = y_mid_by_order[il] - #print(y_mid_itself,'y_mid_itself') x_start_itself = x_start_by_order[il] x_end_itself = x_end_by_order[il] - for column in range(x_start_itself, x_end_itself+1): + for column in range(int(x_start_itself), int(x_end_itself)+1): #print(column,'cols') + #print('burda') #print('burda2') y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & (column >= x_start_by_order) & (column <= x_end_by_order)] - #print(y_mid_next,'y_mid_next') y_mid_next = y_mid_next.min(initial=bot) #print(y_mid_next,'y_mid_next') + #print(y_mid_itself,'y_mid_itself') boxes.append([peaks_neg_tot[column], peaks_neg_tot[column+1], y_mid_itself, y_mid_next]) - # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) + # dbg_plt(boxes[-1], "A column %d box" % (column + 1)) + except: + logger.exception("cannot assign boxes") + boxes.append([0, peaks_neg_tot[len(peaks_neg_tot)-1], + top, bot]) + # dbg_plt(boxes[-1], "fallback box") + else: + # order multi-column separators + y_mid_by_order=[] + x_start_by_order=[] + x_end_by_order=[] + if len(x_starting)>0: + columns_covered_by_seps_covered_more_than_2col = set() + for dj in range(len(x_starting)): + if set(range(x_starting[dj], x_ending[dj])) != all_columns: + columns_covered_by_seps_covered_more_than_2col.update( + range(x_starting[dj], x_ending[dj])) + columns_not_covered = list(all_columns - columns_covered_by_seps_covered_more_than_2col) + + y_mid = np.append(y_mid, np.ones(len(columns_not_covered) + 1, + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * 
len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + if len(new_main_sep_y) > 0: + x_starting = np.append(x_starting, 0) + x_ending = np.append(x_ending, len(peaks_neg_tot) - 1) + else: + x_starting = np.append(x_starting, x_starting[0]) + x_ending = np.append(x_ending, x_ending[0]) + else: + columns_not_covered = list(all_columns) + y_mid = np.append(y_mid, np.ones(len(columns_not_covered), + dtype=int) * top) + ##y_mid_by_order = np.append(y_mid_by_order, [top] * len(columns_not_covered)) + ##x_start_by_order = np.append(x_start_by_order, [0] * len(columns_not_covered)) + x_starting = np.append(x_starting, np.array(columns_not_covered, x_starting.dtype)) + x_ending = np.append(x_ending, np.array(columns_not_covered, x_ending.dtype) + 1) + + ind_args = np.arange(len(y_mid)) + + for column in range(len(peaks_neg_tot)-1): + #print(column,'column') + ind_args_in_col=ind_args[x_starting==column] + #print(len(y_mid)) + y_mid_column=y_mid[ind_args_in_col] + x_start_column=x_starting[ind_args_in_col] + x_end_column=x_ending[ind_args_in_col] + + ind_args_col_sorted = np.argsort(y_mid_column) + y_mid_by_order.extend(y_mid_column[ind_args_col_sorted]) + x_start_by_order.extend(x_start_column[ind_args_col_sorted]) + x_end_by_order.extend(x_end_column[ind_args_col_sorted] - 1) + + # create single-column boxes from multi-column separators + y_mid_by_order = np.array(y_mid_by_order) + x_start_by_order = np.array(x_start_by_order) + x_end_by_order = np.array(x_end_by_order) + for il in range(len(y_mid_by_order)): + #print(il, "il") + y_mid_itself = y_mid_by_order[il] + #print(y_mid_itself,'y_mid_itself') + x_start_itself = x_start_by_order[il] + x_end_itself = x_end_by_order[il] + for column in range(x_start_itself, x_end_itself+1): + #print(column,'cols') + #print('burda2') + y_mid_next = y_mid_by_order[(y_mid_itself < y_mid_by_order) & + (column >= x_start_by_order) & + (column <= x_end_by_order)] + #print(y_mid_next,'y_mid_next') + y_mid_next = y_mid_next.min(initial=bot) + #print(y_mid_next,'y_mid_next') + boxes.append([peaks_neg_tot[column], + peaks_neg_tot[column+1], + y_mid_itself, + y_mid_next]) + # dbg_plt(boxes[-1], "B column %d box" % (column + 1)) if right2left_readingorder: peaks_neg_tot_tables_new = [] From a2a9fe51175cfd11bc62d1e917bf79b299a7846e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 02:35:04 +0200 Subject: [PATCH 19/21] `delete_separator_around`: simplify, eynollah: identifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use array instead of list operations - rename identifiers: - `pixel` → `label` - `line` → `sep` --- src/eynollah/eynollah.py | 104 ++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 57 deletions(-) diff --git a/src/eynollah/eynollah.py b/src/eynollah/eynollah.py index 08ffed7..eee3777 100644 --- a/src/eynollah/eynollah.py +++ b/src/eynollah/eynollah.py @@ -2669,45 +2669,35 @@ def check_iou_of_bounding_box_and_contour_for_tables( return layout_org, contours_new - def delete_separator_around(self, spliter_y,peaks_neg,image_by_region, pixel_line, pixel_table): + def delete_separator_around(self, splitter_y, peaks_neg, image_by_region, label_seps, label_table): # format of subboxes: box=[x1, x2 , y1, y2] pix_del = 100 - if 
len(image_by_region.shape)==3: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_line] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_line] = 0 - - image_by_region[ys,xs,0][image_by_region[ys,xs,0]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,1]==pixel_table] = 0 - image_by_region[ys,xs,0][image_by_region[ys,xs,2]==pixel_table] = 0 - else: - for i in range(len(spliter_y)-1): - for j in range(1,len(peaks_neg[i])-1): - ys = slice(int(spliter_y[i]), - int(spliter_y[i+1])) - xs = slice(peaks_neg[i][j] - pix_del, - peaks_neg[i][j] + pix_del) - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_line] = 0 - image_by_region[ys,xs][image_by_region[ys,xs]==pixel_table] = 0 + for i in range(len(splitter_y)-1): + for j in range(1,len(peaks_neg[i])-1): + where = np.index_exp[splitter_y[i]: + splitter_y[i+1], + peaks_neg[i][j] - pix_del: + peaks_neg[i][j] + pix_del, + :] + if image_by_region.ndim < 3: + where = where[:2] + else: + print("image_by_region ndim is 3!") # rs + image_by_region[where][image_by_region[where] == label_seps] = 0 + image_by_region[where][image_by_region[where] == label_table] = 0 return image_by_region def add_tables_heuristic_to_layout( self, image_regions_eraly_p, boxes, - slope_mean_hor, spliter_y, peaks_neg_tot, image_revised, - num_col_classifier, min_area, pixel_line): + slope_mean_hor, splitter_y, peaks_neg_tot, image_revised, + num_col_classifier, min_area, label_seps): - pixel_table =10 - image_revised_1 = self.delete_separator_around(spliter_y, peaks_neg_tot, image_revised, pixel_line, pixel_table) + label_table =10 + image_revised_1 = self.delete_separator_around(splitter_y, peaks_neg_tot, image_revised, label_seps, label_table) try: - image_revised_1[:,:30][image_revised_1[:,:30]==pixel_line] = 0 - image_revised_1[:,-30:][image_revised_1[:,-30:]==pixel_line] = 0 + image_revised_1[:,:30][image_revised_1[:,:30]==label_seps] = 0 + image_revised_1[:,-30:][image_revised_1[:,-30:]==label_seps] = 0 except: pass boxes = np.array(boxes, dtype=int) # to be on the safe side @@ -2718,7 +2708,7 @@ def add_tables_heuristic_to_layout( _, thresh = cv2.threshold(image_col, 0, 255, 0) contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) - if indiv==pixel_table: + if indiv==label_table: main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy, max_area=1, min_area=0.001) else: @@ -2734,11 +2724,11 @@ def add_tables_heuristic_to_layout( box_xs = slice(*boxes[i][0:2]) image_box = img_comm[box_ys, box_xs] try: - image_box_tabels_1 = (image_box == pixel_table) * 1 + image_box_tabels_1 = (image_box == label_table) * 1 contours_tab,_=return_contours_of_image(image_box_tabels_1) contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003) - image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1 - image_box_tabels_and_m_text = ( (image_box == pixel_table) | + image_box_tabels_1 = (image_box == label_seps).astype(np.uint8) * 1 + image_box_tabels_and_m_text = ( (image_box == label_table) | (image_box == 1) ).astype(np.uint8) * 1 image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5) @@ -2800,7 +2790,7 @@ def add_tables_heuristic_to_layout( y_up_tabs=[] for ii in 
range(len(y_up_tabs)): - image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table + image_box[y_up_tabs[ii]:y_down_tabs[ii]] = label_table image_revised_last[box_ys, box_xs] = image_box else: @@ -2811,14 +2801,14 @@ def add_tables_heuristic_to_layout( image_revised_last[box_ys, box_xs] = image_box if num_col_classifier==1: - img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8) + img_tables_col_1 = (image_revised_last == label_table).astype(np.uint8) contours_table_col1, _ = return_contours_of_image(img_tables_col_1) _,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1) if len(y_min_tab_col1)>0: for ijv in range(len(y_min_tab_col1)): - image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table + image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = label_table return image_revised_last def get_tables_from_model(self, img, num_col_classifier): @@ -3153,14 +3143,14 @@ def run_boxes_no_full_layout( text_regions_p_1_n = None textline_mask_tot_d = None regions_without_separators_d = None - pixel_lines = 3 + label_seps = 3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document( + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: - _, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + _, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) #print(time.time()-t_0_box,'time box in 2') self.logger.info("num_col_classifier: %s", num_col_classifier) @@ -3175,7 +3165,7 @@ def run_boxes_no_full_layout( t1 = time.time() if np.abs(slope_deskew) < SLOPE_THRESHOLD: boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new, regions_without_separators, matrix_of_lines_ch, + splitter_y_new, regions_without_separators, matrix_of_seps_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes_d = None self.logger.debug("len(boxes): %s", len(boxes)) @@ -3187,17 +3177,17 @@ def run_boxes_no_full_layout( else: text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[(table_prediction == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) #print(time.time()-t_0_box,'time box in 3.2') img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) #print(time.time()-t_0_box,'time box in 3.3') else: boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( - splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, + splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left) boxes = None self.logger.debug("len(boxes): %s", len(boxes_d)) @@ -3210,11 +3200,11 @@ def run_boxes_no_full_layout( text_regions_p_tables = np.round(text_regions_p_tables) 
text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -3333,14 +3323,14 @@ def run_boxes_full_layout( regions_without_separators = (text_regions_p[:,:] == 1)*1 regions_without_separators[table_prediction == 1] = 1 - pixel_lines=3 + label_seps=3 if np.abs(slope_deskew) < SLOPE_THRESHOLD: num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document( - text_regions_p, num_col_classifier, self.tables, pixel_lines) + text_regions_p, num_col_classifier, self.tables, label_seps) if np.abs(slope_deskew) >= SLOPE_THRESHOLD: num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document( - text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines) + text_regions_p_1_n, num_col_classifier, self.tables, label_seps) if num_col_classifier>=3: if np.abs(slope_deskew) < SLOPE_THRESHOLD: @@ -3359,10 +3349,10 @@ def run_boxes_full_layout( num_col_classifier, erosion_hurts, self.tables, self.right2left) text_regions_p_tables = np.copy(text_regions_p) text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables, - num_col_classifier , 0.000005, pixel_line) + num_col_classifier , 0.000005, label_seps) img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction, 10, num_col_classifier) @@ -3374,11 +3364,11 @@ def run_boxes_full_layout( text_regions_p_tables = np.round(text_regions_p_tables) text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10 - pixel_line = 3 + label_seps = 3 img_revised_tab2 = self.add_tables_heuristic_to_layout( text_regions_p_tables, boxes_d, 0, splitter_y_new_d, peaks_neg_tot_tables_d, text_regions_p_tables, - num_col_classifier, 0.000005, pixel_line) + num_col_classifier, 0.000005, label_seps) img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables( img_revised_tab2, table_prediction_n, 10, num_col_classifier) @@ -4721,12 +4711,12 @@ def deskew(polygon): regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6) if np.abs(slope_deskew) < SLOPE_THRESHOLD: - boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new( + boxes, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new, regions_without_separators, matrix_of_lines_ch, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) else: - boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new( + boxes_d, _ = return_boxes_of_images_by_order_of_reading_new( splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d, num_col_classifier, erosion_hurts, self.tables, self.right2left, logger=self.logger) From 3367462d181bca16316e84957299e0abb08ec0d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 Oct 2025 22:46:46 +0200 Subject: [PATCH 20/21] `return_boxes_of_images_by_order_of_reading_new`: change 
arg order
---
 src/eynollah/utils/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index f30d55e..a163fad 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -33,7 +33,7 @@ def pairwise(iterable):
         a = b
 
 def return_x_start_end_mothers_childs_and_type_of_reading_order(
-        x_min_hor_some, x_max_hor_some, cy_hor_some, peak_points, y_max_hor_some):
+        peak_points, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some):
     """
     Analyse which separators overlap multiple column candidates,
     and how they overlap each other.
@@ -54,10 +54,10 @@ def return_x_start_end_mothers_childs_and_type_of_reading_order(
     with no mother, specifically (and thus, no simple box separation is possible).
 
     Arguments:
+    * the x column coordinates
     * the x start column index of the raw separators
     * the x end column index of the raw separators
     * the y center coordinate of the raw separators
-    * the x column coordinates
     * the y end coordinate of the raw separators
 
     Returns:
@@ -1736,7 +1736,7 @@ def return_boxes_of_images_by_order_of_reading_new(
                 there_is_sep_with_child, \
                 y_mid_with_child_without_mother, x_start_with_child_without_mother, x_end_with_child_without_mother, \
                 new_main_sep_y = return_x_start_end_mothers_childs_and_type_of_reading_order(
-                    x_min_hor_some, x_max_hor_some, cy_hor_some, peaks_neg_tot, y_max_hor_some)
+                    peaks_neg_tot, x_min_hor_some, x_max_hor_some, cy_hor_some, y_max_hor_some)
 
                 # show multi-column separators
                 # dbg_plt([0, None, top, bot], "multi-column separators in current split",

From 19b2c3fa424f8750e093a2fb88d7e6e381daeaab Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Fri, 24 Oct 2025 22:51:19 +0200
Subject: [PATCH 21/21] reading order: improve handling of headings and
 horizontal seps

- drop connected components analysis to test overlaps between horizontal
  separators and (horizontal) neighbours (introduced in ab17a927)
- instead of converting headings to topline and baseline during
  `find_number_of_columns_in_document` (introduced in 9f1595d7),
  add them to the matrix unchanged, but mark them as an extra type
  (besides horizontal and vertical separators)
- convert headings to toplines and baselines no earlier than in
  `return_boxes_of_images_by_order_of_reading_new`
- for both headings and horizontal separators, if they already span
  multiple columns, check if they would overlap (horizontal) neighbours
  by looking at successively larger (left and right) intervals of columns
  (and pick the largest elongation which does not introduce any overlaps)
---
 src/eynollah/utils/__init__.py | 127 +++++++++++++++++++++------------
 1 file changed, 80 insertions(+), 47 deletions(-)

diff --git a/src/eynollah/utils/__init__.py b/src/eynollah/utils/__init__.py
index a163fad..f3dbae2 100644
--- a/src/eynollah/utils/__init__.py
+++ b/src/eynollah/utils/__init__.py
@@ -1387,8 +1387,6 @@ def return_points_with_boundies(peaks_neg_fin, first_point, last_point):
     return peaks_neg_tot
 
 def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, label_seps, contours_h=None):
-    ncomps, ccomps = cv2.connectedComponents(region_pre_p.astype(np.uint8))
-
     separators_closeup = 1 * (region_pre_p == label_seps)
     separators_closeup[0:110] = 0
     separators_closeup[-150:] = 0
@@ -1414,14 +1412,6 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables,
             dist_ye = max_ye - min_ye
             if dist_ye <= 50 and dist_xe >= 3 * dist_ye:
                 cnts_hor_e.append(cnt)
-                labels = 
np.setdiff1d(np.unique(ccomps[med_ye]), [0]) - if len(labels) == 1: - # mid line does not intersect with any other region - # so add it as extra splitter line - cnts_hor_e.append(np.array([[[0, med_ye]], - [[ccomps.shape[1], med_ye]], - [[ccomps.shape[1], med_ye + 1]], - [[0, med_ye + 1]]])) # delete horizontal contours (leaving only the edges) separators_closeup_n_binary = cv2.fillPoly(separators_closeup_n_binary, pts=cnts_hor_e, color=0) @@ -1493,7 +1483,7 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, slope_seps_org_hor=slope_seps_org_hor[dist_x_hor>=len_x/2.0] dist_x_hor=dist_x_hor[dist_x_hor>=len_x/2.0] - matrix_of_seps_ch=np.zeros((len(cy_seps_hor)+len(cx_seps_ver),10)) + matrix_of_seps_ch = np.zeros((len(cy_seps_hor)+len(cx_seps_ver), 10), dtype=int) matrix_of_seps_ch[:len(cy_seps_hor),0]=args_hor matrix_of_seps_ch[len(cy_seps_hor):,0]=args_ver matrix_of_seps_ch[len(cy_seps_hor):,1]=cx_seps_ver @@ -1515,34 +1505,17 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, if contours_h is not None: _, dist_x_head, x_min_head, x_max_head, cy_head, _, y_min_head, y_max_head, _ = \ find_features_of_lines(contours_h) - # matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) - # args_head = np.arange(len(cy_head)) - # matrix_l_n[:, 0] = args_head - # matrix_l_n[:, 2] = x_min_head+30 - # matrix_l_n[:, 3] = x_max_head-30 - # matrix_l_n[:, 4] = dist_x_head - # matrix_l_n[:, 5] = y_min_head-3-8 - # matrix_l_n[:, 6] = y_min_head-5-8 - # matrix_l_n[:, 7] = y_max_head#y_min_head+1-8 - # matrix_l_n[:, 8] = 4 - # split at toplines (y_min_head) and baselines (y_max_head) instead of center (cy_head): - cy_head = np.stack((y_min_head, y_max_head)).T.flatten() - y_min_head, y_max_head = (np.stack((y_min_head - 2, y_max_head - 2)).T.flatten(), - np.stack((y_min_head + 2, y_max_head + 2)).T.flatten()) - x_min_head = np.repeat(x_min_head, 2) - x_max_head = np.repeat(x_max_head, 2) - dist_x_head = np.repeat(dist_x_head, 2) - matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1])) + matrix_l_n = np.zeros((len(cy_head), matrix_of_seps_ch.shape[1]), dtype=int) args_head = np.arange(len(cy_head)) matrix_l_n[:, 0] = args_head - # +/- 30px to avoid crossing col peaks by accident - matrix_l_n[:, 2] = x_min_head + 30 - matrix_l_n[:, 3] = x_max_head - 30 + matrix_l_n[:, 2] = x_min_head + matrix_l_n[:, 3] = x_max_head matrix_l_n[:, 4] = dist_x_head matrix_l_n[:, 5] = cy_head matrix_l_n[:, 6] = y_min_head matrix_l_n[:, 7] = y_max_head - matrix_l_n[:, 8] = 4 + matrix_l_n[:, 8] = y_max_head - y_min_head + matrix_l_n[:, 9] = 2 # mark as heading (so it can be split into 2 horizontal separators as needed) matrix_of_seps_ch = np.append( matrix_of_seps_ch, matrix_l_n, axis=0) @@ -1551,9 +1524,12 @@ def find_number_of_columns_in_document(region_pre_p, num_col_classifier, tables, cy_seps_splitters = np.append(cy_seps_splitters, special_separators) if contours_h is not None: - cy_seps_splitters_head=cy_head[(x_min_head<=.16*region_pre_p.shape[1]) & - (x_max_head>=.84*region_pre_p.shape[1])] - cy_seps_splitters = np.append(cy_seps_splitters, cy_seps_splitters_head) + y_min_splitters_head = y_min_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + y_max_splitters_head = y_max_head[(x_min_head<=.16*region_pre_p.shape[1]) & + (x_max_head>=.84*region_pre_p.shape[1])] + cy_seps_splitters = np.append(cy_seps_splitters, y_min_splitters_head) + cy_seps_splitters = np.append(cy_seps_splitters, 
y_max_splitters_head) cy_seps_splitters = np.sort(cy_seps_splitters).astype(int) splitter_y_new = [0] + list(cy_seps_splitters) + [region_pre_p.shape[0]] @@ -1713,17 +1689,6 @@ def return_boxes_of_images_by_order_of_reading_new( #num_col, peaks_neg_fin = find_num_col( # regions_without_separators[top:bot,:], # multiplier=7.0) - x_min_hor_some=matrix_new[:,2][ (matrix_new[:,9]==0) ] - x_max_hor_some=matrix_new[:,3][ (matrix_new[:,9]==0) ] - cy_hor_some=matrix_new[:,5][ (matrix_new[:,9]==0) ] - y_max_hor_some=matrix_new[:,7][ (matrix_new[:,9]==0) ] - - if right2left_readingorder: - x_max_hor_some_new = width_tot - x_min_hor_some - x_min_hor_some_new = width_tot - x_max_hor_some - x_min_hor_some =list(np.copy(x_min_hor_some_new)) - x_max_hor_some =list(np.copy(x_max_hor_some_new)) - peaks_neg_tot = np.array([0] + peaks_neg_fin + [width_tot]) #print(peaks_neg_tot,'peaks_neg_tot') peaks_neg_tot_tables.append(peaks_neg_tot) @@ -1731,6 +1696,74 @@ def return_boxes_of_images_by_order_of_reading_new( all_columns = set(range(len(peaks_neg_tot) - 1)) #print("all_columns", all_columns) + # elongate horizontal separators+headings as much as possible without overlap + args_nonver = matrix_new[:, 9] != 1 + regions_with_separators = np.copy(regions_without_separators[top:bot]) + for xmin, xmax, ymin, ymax in matrix_new[:, [2, 3, 6, 7]]: + regions_with_separators[ymin - top: ymax - top, xmin: xmax] = 6 + # def dbg_imshow(box, title): + # xmin, xmax, ymin, ymax = box + # plt.imshow(regions_with_separators, extent=[0, width_tot, bot, top]) + # plt.gca().add_patch(patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, + # fill=False, linewidth=1, edgecolor='r')) + # plt.title(title + " at %d:%d, %d:%d" % (ymin, ymax, xmin, xmax)) + # plt.show() + for i in np.flatnonzero(args_nonver): + xmin, xmax, ymin, ymax, typ = matrix_new[i, [2, 3, 6, 7, 9]] + cut = regions_with_separators[ymin - top: ymax - top] + # dbg_imshow([xmin, xmax, ymin, ymax], "separator %d (%s)" % (i, "heading" if typ else "horizontal")) + starting = xmin - peaks_neg_tot + min_start = np.flatnonzero(starting >= 0)[-1] # last left-of + ending = xmax - peaks_neg_tot + max_end = np.flatnonzero(ending < 0)[0] # first right-of + # skip elongation unless this is already a multi-column separator/heading: + if not max_end - min_start > 1: + continue + # is there anything left of min_start? + for j in range(min_start): + # dbg_imshow([peaks_neg_tot[j], xmin, ymin, ymax], "start of %d candidate %d" % (i, j)) + if not np.any(cut[:, peaks_neg_tot[j]: xmin]): + # print("elongated sep", i, "typ", typ, "start", xmin, "to", j, peaks_neg_tot[j]) + matrix_new[i, 2] = peaks_neg_tot[j] + 1 # elongate to start of this column + break + # is there anything right of max_end? 
+                for j in range(len(peaks_neg_tot) - 1, max_end, -1):
+                    # dbg_imshow([xmax, peaks_neg_tot[j], ymin, ymax], "end of %d candidate %d" % (i, j))
+                    if not np.any(cut[:, xmax: peaks_neg_tot[j]]):
+                        # print("elongated sep", i, "typ", typ, "end", xmax, "to", j, peaks_neg_tot[j])
+                        matrix_new[i, 3] = peaks_neg_tot[j] - 1 # elongate to end of this column
+                        break
+
+            args_hor = matrix_new[:, 9] == 0
+            x_min_hor_some = matrix_new[:, 2][args_hor]
+            x_max_hor_some = matrix_new[:, 3][args_hor]
+            y_max_hor_some = matrix_new[:, 7][args_hor]
+            cy_hor_some = matrix_new[:, 5][args_hor]
+
+            args_head = matrix_new[:, 9] == 2
+            x_min_hor_head = matrix_new[:, 2][args_head]
+            x_max_hor_head = matrix_new[:, 3][args_head]
+            y_min_hor_head = matrix_new[:, 6][args_head]
+            y_max_hor_head = matrix_new[:, 7][args_head]
+            cy_hor_head = matrix_new[:, 5][args_head]
+
+            # split headings at toplines (y_min_head) and baselines (y_max_head)
+            # instead of merely adding their center (cy_head) as horizontal separator
+            # (x +/- 30px to avoid crossing col peaks by accident)
+            x_min_hor_some = np.append(x_min_hor_some, np.tile(x_min_hor_head + 30, 2))
+            x_max_hor_some = np.append(x_max_hor_some, np.tile(x_max_hor_head - 30, 2))
+            y_max_hor_some = np.append(y_max_hor_some, # baselines
+                                       np.concatenate((y_min_hor_head + 2,
+                                                       y_max_hor_head + 2)))
+            cy_hor_some = np.append(cy_hor_some, # toplines
+                                    np.concatenate((y_min_hor_head - 2,
+                                                    y_max_hor_head - 2)))
+
+            if right2left_readingorder:
+                x_min_hor_some, x_max_hor_some = (width_tot - x_max_hor_some, # mirror both at once
+                                                  width_tot - x_min_hor_some)
+
+
             reading_order_type, x_starting, x_ending, y_mid, y_max, \
                 y_mid_without_mother, x_start_without_mother, x_end_without_mother, \
                 there_is_sep_with_child, \