Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
086c188
binarization: add option `--overwrite`, skip existing outputs
Oct 15, 2025
184927f
`find_num_cols`: re-sort peaks when cutting n-best `num_col_classifier`
Oct 20, 2025
48761c3
`find_num_col`: simplify, add better plotting (but commented out)
Oct 20, 2025
c43a825
`order_of_regions`: filter out-of-image peaks
Oct 20, 2025
d3d599b
`order_of_regions`: add better plotting (but commented out)
Oct 20, 2025
542d38a
`find_number_of_columns_in_document`: simplify, rename `line`→`seps`
Oct 20, 2025
5a0e4c3
`find_number_of_columns_in_document`: improve splitter rule
Oct 20, 2025
cd35241
`find_number_of_columns_in_document`: split headings at top+baseline
Oct 20, 2025
7c3e418
`return_boxes_of_images_by_order_of_reading_new`: simplify
Oct 20, 2025
0fc4b25
`return_boxes_of_images_by_order_of_reading_new`: fix no-mother case
Oct 20, 2025
e2dfec7
`return_x_start_end_mothers_childs_and_type_of_reading_order`:
Oct 23, 2025
b2a79cc
`return_x_start_end_mothers_childs_and_type_of_reading_order`: fix+1
Oct 23, 2025
acee4c1
`find_number_of_columns_in_document`: simplify
Oct 23, 2025
5d15941
`contours_in_same_horizon`: simplify
Oct 23, 2025
6cc5900
`find_num_col`: add better plotting (but commented out)
Oct 23, 2025
6fbb5f8
`return_boxes_of_images_by_order_of_reading_new`: simplify
Oct 24, 2025
66a0e55
`return_boxes_of_images_by_order_of_reading_new`: avoid oversplits
Oct 24, 2025
3ebbc2d
`return_boxes_of_images_by_order_of_reading_new`: indent
Oct 24, 2025
a2a9fe5
`delete_separator_around`: simplify, eynollah: identifiers
Oct 24, 2025
3367462
`return_boxes_of_images_by_order_of_reading_new`: change arg order
Oct 24, 2025
19b2c3f
reading order: improve handling of headings and horizontal seps
Oct 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions src/eynollah/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,28 @@ def machine_based_reading_order(input, dir_in, out, model, log_level):
type=click.Path(file_okay=True, dir_okay=True),
required=True,
)
@click.option(
"--overwrite",
"-O",
help="overwrite (instead of skipping) if output xml exists",
is_flag=True,
)
@click.option(
"--log_level",
"-l",
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']),
help="Override log level globally to this",
)
def binarization(patches, model_dir, input_image, dir_in, output, log_level):
def binarization(patches, model_dir, input_image, dir_in, output, overwrite, log_level):
assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
binarizer = SbbBinarizer(model_dir)
if log_level:
binarizer.log.setLevel(getLevelName(log_level))
binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in)
binarizer.logger.setLevel(getLevelName(log_level))
binarizer.run(overwrite=overwrite,
use_patches=patches,
image_path=input_image,
output=output,
dir_in=dir_in)


@main.command()
Expand Down
116 changes: 55 additions & 61 deletions src/eynollah/eynollah.py
Original file line number Diff line number Diff line change
Expand Up @@ -2507,13 +2507,15 @@ def match_boxes(only_centers: bool):
My_main[ii] < box[3])):
arg_text_con_main[ii] = jj
check_if_textregion_located_in_a_box = True
#print("main/matched", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", box, only_centers)
break
if not check_if_textregion_located_in_a_box:
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_main[ii]], [cx_main[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_main[ii]) & (cy_main[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_main[ii]) & (cx_main[ii] < boxes[:, 1]))
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con_main[ii] = ind_min
#print("main/fallback", (mx_main[ii], Mx_main[ii], my_main[ii], My_main[ii]), "\tin", boxes[ind_min], only_centers)
args_contours_main = np.arange(len(contours_only_text_parent))
order_by_con_main = np.zeros_like(arg_text_con_main)

Expand All @@ -2531,13 +2533,15 @@ def match_boxes(only_centers: bool):
My_head[ii] < box[3])):
arg_text_con_head[ii] = jj
check_if_textregion_located_in_a_box = True
#print("head/matched", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", box, only_centers)
break
if not check_if_textregion_located_in_a_box:
dists_tr_from_box = np.linalg.norm(c_boxes - np.array([[cy_head[ii]], [cx_head[ii]]]), axis=0)
pcontained_in_box = ((boxes[:, 2] <= cy_head[ii]) & (cy_head[ii] < boxes[:, 3]) &
(boxes[:, 0] <= cx_head[ii]) & (cx_head[ii] < boxes[:, 1]))
ind_min = np.argmin(np.ma.masked_array(dists_tr_from_box, ~pcontained_in_box))
arg_text_con_head[ii] = ind_min
#print("head/fallback", (mx_head[ii], Mx_head[ii], my_head[ii], My_head[ii]), "\tin", boxes[ind_min], only_centers)
args_contours_head = np.arange(len(contours_only_text_parent_h))
order_by_con_head = np.zeros_like(arg_text_con_head)

Expand All @@ -2553,7 +2557,7 @@ def match_boxes(only_centers: bool):
con_inter_box_h = contours_only_text_parent_h[args_contours_box_head]

indexes_sorted, kind_of_texts_sorted, index_by_kind_sorted = order_of_regions(
textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2])
textline_mask_tot[ys, xs], con_inter_box, con_inter_box_h, box[2], box[0])

order_of_texts, id_of_texts = order_and_id_of_texts(
con_inter_box, con_inter_box_h,
Expand Down Expand Up @@ -2587,7 +2591,7 @@ def match_boxes(only_centers: bool):
try:
results = match_boxes(False)
except Exception as why:
self.logger.error(why)
self.logger.exception(why)
results = match_boxes(True)

self.logger.debug("exit do_order_of_regions")
Expand Down Expand Up @@ -2665,45 +2669,35 @@ def check_iou_of_bounding_box_and_contour_for_tables(

return layout_org, contours_new

def delete_separator_around(self, splitter_y, peaks_neg, image_by_region, label_seps, label_table):
    """Erase separator and table labels in a band around each inner peak.

    For every vertical section ``splitter_y[i]:splitter_y[i+1]`` and every
    entry of ``peaks_neg[i]`` except the first and the last, all pixels
    within ``pix_del`` columns left and right of the peak whose value equals
    ``label_seps`` or ``label_table`` are set to 0.

    Args:
        splitter_y: sequence of row positions delimiting the sections.
        peaks_neg: per-section sequences of column positions
            (presumably column boundaries - confirm against caller).
        image_by_region: 2-D (rows, cols) or 3-D (rows, cols, channels)
            NumPy label array; for 3-D input every channel is treated alike.
        label_seps: label value to erase (separators).
        label_table: label value to erase (tables).

    Returns:
        ``image_by_region`` itself; the array is modified in place.
    """
    pix_del = 100  # half-width (in columns) of the band cleared around a peak
    for i in range(len(splitter_y) - 1):
        # slice bounds must be integers
        y_start = int(splitter_y[i])
        y_stop = int(splitter_y[i + 1])
        # only inner peaks: the first and last entry are skipped
        for peak in peaks_neg[i][1:-1]:
            # clamp at 0: a negative start would be interpreted as counting
            # from the right edge and yield an empty (or wrong) window
            x_start = max(int(peak) - pix_del, 0)
            x_stop = int(peak) + pix_del
            # basic slicing returns a view, so writes land in image_by_region;
            # this covers the 2-D and the 3-D case alike
            window = image_by_region[y_start:y_stop, x_start:x_stop]
            window[(window == label_seps) | (window == label_table)] = 0
    return image_by_region

def add_tables_heuristic_to_layout(
self, image_regions_eraly_p, boxes,
slope_mean_hor, spliter_y, peaks_neg_tot, image_revised,
num_col_classifier, min_area, pixel_line):
slope_mean_hor, splitter_y, peaks_neg_tot, image_revised,
num_col_classifier, min_area, label_seps):

pixel_table =10
image_revised_1 = self.delete_separator_around(spliter_y, peaks_neg_tot, image_revised, pixel_line, pixel_table)
label_table =10
image_revised_1 = self.delete_separator_around(splitter_y, peaks_neg_tot, image_revised, label_seps, label_table)

try:
image_revised_1[:,:30][image_revised_1[:,:30]==pixel_line] = 0
image_revised_1[:,-30:][image_revised_1[:,-30:]==pixel_line] = 0
image_revised_1[:,:30][image_revised_1[:,:30]==label_seps] = 0
image_revised_1[:,-30:][image_revised_1[:,-30:]==label_seps] = 0
except:
pass
boxes = np.array(boxes, dtype=int) # to be on the safe side
Expand All @@ -2714,7 +2708,7 @@ def add_tables_heuristic_to_layout(
_, thresh = cv2.threshold(image_col, 0, 255, 0)
contours,hirarchy=cv2.findContours(thresh.copy(), cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)

if indiv==pixel_table:
if indiv==label_table:
main_contours = filter_contours_area_of_image_tables(thresh, contours, hirarchy,
max_area=1, min_area=0.001)
else:
Expand All @@ -2730,11 +2724,11 @@ def add_tables_heuristic_to_layout(
box_xs = slice(*boxes[i][0:2])
image_box = img_comm[box_ys, box_xs]
try:
image_box_tabels_1 = (image_box == pixel_table) * 1
image_box_tabels_1 = (image_box == label_table) * 1
contours_tab,_=return_contours_of_image(image_box_tabels_1)
contours_tab=filter_contours_area_of_image_tables(image_box_tabels_1,contours_tab,_,1,0.003)
image_box_tabels_1 = (image_box == pixel_line).astype(np.uint8) * 1
image_box_tabels_and_m_text = ( (image_box == pixel_table) |
image_box_tabels_1 = (image_box == label_seps).astype(np.uint8) * 1
image_box_tabels_and_m_text = ( (image_box == label_table) |
(image_box == 1) ).astype(np.uint8) * 1

image_box_tabels_1 = cv2.dilate(image_box_tabels_1, KERNEL, iterations=5)
Expand Down Expand Up @@ -2796,7 +2790,7 @@ def add_tables_heuristic_to_layout(
y_up_tabs=[]

for ii in range(len(y_up_tabs)):
image_box[y_up_tabs[ii]:y_down_tabs[ii]] = pixel_table
image_box[y_up_tabs[ii]:y_down_tabs[ii]] = label_table

image_revised_last[box_ys, box_xs] = image_box
else:
Expand All @@ -2807,14 +2801,14 @@ def add_tables_heuristic_to_layout(
image_revised_last[box_ys, box_xs] = image_box

if num_col_classifier==1:
img_tables_col_1 = (image_revised_last == pixel_table).astype(np.uint8)
img_tables_col_1 = (image_revised_last == label_table).astype(np.uint8)
contours_table_col1, _ = return_contours_of_image(img_tables_col_1)

_,_ ,_ , _, y_min_tab_col1 ,y_max_tab_col1, _= find_new_features_of_contours(contours_table_col1)

if len(y_min_tab_col1)>0:
for ijv in range(len(y_min_tab_col1)):
image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = pixel_table
image_revised_last[int(y_min_tab_col1[ijv]):int(y_max_tab_col1[ijv])] = label_table
return image_revised_last

def get_tables_from_model(self, img, num_col_classifier):
Expand Down Expand Up @@ -2976,7 +2970,7 @@ def run_graphics_and_columns_light(
max(self.num_col_lower or num_col_classifier,
num_col_classifier))
except Exception as why:
self.logger.error(why)
self.logger.exception(why)
num_col = None
#print("inside graphics 3 ", time.time() - t_in_gr)
return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines,
Expand Down Expand Up @@ -3044,7 +3038,7 @@ def run_graphics_and_columns(
if not num_column_is_classified:
num_col_classifier = num_col + 1
except Exception as why:
self.logger.error(why)
self.logger.exception(why)
num_col = None
return (num_col, num_col_classifier, img_only_regions, page_coord, image_page, mask_images, mask_lines,
text_regions_p_1, cont_page, table_prediction)
Expand Down Expand Up @@ -3149,14 +3143,14 @@ def run_boxes_no_full_layout(
text_regions_p_1_n = None
textline_mask_tot_d = None
regions_without_separators_d = None
pixel_lines = 3
label_seps = 3
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
_, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document(
text_regions_p, num_col_classifier, self.tables, pixel_lines)
_, _, matrix_of_seps_ch, splitter_y_new, _ = find_number_of_columns_in_document(
text_regions_p, num_col_classifier, self.tables, label_seps)

if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
_, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines)
_, _, matrix_of_seps_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
text_regions_p_1_n, num_col_classifier, self.tables, label_seps)
#print(time.time()-t_0_box,'time box in 2')
self.logger.info("num_col_classifier: %s", num_col_classifier)

Expand All @@ -3171,7 +3165,7 @@ def run_boxes_no_full_layout(
t1 = time.time()
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(
splitter_y_new, regions_without_separators, matrix_of_lines_ch,
splitter_y_new, regions_without_separators, matrix_of_seps_ch,
num_col_classifier, erosion_hurts, self.tables, self.right2left)
boxes_d = None
self.logger.debug("len(boxes): %s", len(boxes))
Expand All @@ -3183,17 +3177,17 @@ def run_boxes_no_full_layout(
else:
text_regions_p_tables = np.copy(text_regions_p)
text_regions_p_tables[(table_prediction == 1)] = 10
pixel_line = 3
label_seps = 3
img_revised_tab2 = self.add_tables_heuristic_to_layout(
text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables,
num_col_classifier , 0.000005, pixel_line)
num_col_classifier , 0.000005, label_seps)
#print(time.time()-t_0_box,'time box in 3.2')
img_revised_tab2, contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables(
img_revised_tab2, table_prediction, 10, num_col_classifier)
#print(time.time()-t_0_box,'time box in 3.3')
else:
boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(
splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d,
splitter_y_new_d, regions_without_separators_d, matrix_of_seps_ch_d,
num_col_classifier, erosion_hurts, self.tables, self.right2left)
boxes = None
self.logger.debug("len(boxes): %s", len(boxes_d))
Expand All @@ -3206,11 +3200,11 @@ def run_boxes_no_full_layout(
text_regions_p_tables = np.round(text_regions_p_tables)
text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10

pixel_line = 3
label_seps = 3
img_revised_tab2 = self.add_tables_heuristic_to_layout(
text_regions_p_tables, boxes_d, 0, splitter_y_new_d,
peaks_neg_tot_tables_d, text_regions_p_tables,
num_col_classifier, 0.000005, pixel_line)
num_col_classifier, 0.000005, label_seps)
img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables(
img_revised_tab2, table_prediction_n, 10, num_col_classifier)

Expand Down Expand Up @@ -3329,14 +3323,14 @@ def run_boxes_full_layout(
regions_without_separators = (text_regions_p[:,:] == 1)*1
regions_without_separators[table_prediction == 1] = 1

pixel_lines=3
label_seps=3
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
num_col, _, matrix_of_lines_ch, splitter_y_new, _ = find_number_of_columns_in_document(
text_regions_p, num_col_classifier, self.tables, pixel_lines)
text_regions_p, num_col_classifier, self.tables, label_seps)

if np.abs(slope_deskew) >= SLOPE_THRESHOLD:
num_col_d, _, matrix_of_lines_ch_d, splitter_y_new_d, _ = find_number_of_columns_in_document(
text_regions_p_1_n, num_col_classifier, self.tables, pixel_lines)
text_regions_p_1_n, num_col_classifier, self.tables, label_seps)

if num_col_classifier>=3:
if np.abs(slope_deskew) < SLOPE_THRESHOLD:
Expand All @@ -3355,10 +3349,10 @@ def run_boxes_full_layout(
num_col_classifier, erosion_hurts, self.tables, self.right2left)
text_regions_p_tables = np.copy(text_regions_p)
text_regions_p_tables[:,:][(table_prediction[:,:]==1)] = 10
pixel_line = 3
label_seps = 3
img_revised_tab2 = self.add_tables_heuristic_to_layout(
text_regions_p_tables, boxes, 0, splitter_y_new, peaks_neg_tot_tables, text_regions_p_tables,
num_col_classifier , 0.000005, pixel_line)
num_col_classifier , 0.000005, label_seps)

img_revised_tab2,contoures_tables = self.check_iou_of_bounding_box_and_contour_for_tables(
img_revised_tab2, table_prediction, 10, num_col_classifier)
Expand All @@ -3370,11 +3364,11 @@ def run_boxes_full_layout(
text_regions_p_tables = np.round(text_regions_p_tables)
text_regions_p_tables[(text_regions_p_tables != 3) & (table_prediction_n == 1)] = 10

pixel_line = 3
label_seps = 3
img_revised_tab2 = self.add_tables_heuristic_to_layout(
text_regions_p_tables, boxes_d, 0, splitter_y_new_d,
peaks_neg_tot_tables_d, text_regions_p_tables,
num_col_classifier, 0.000005, pixel_line)
num_col_classifier, 0.000005, label_seps)

img_revised_tab2_d,_ = self.check_iou_of_bounding_box_and_contour_for_tables(
img_revised_tab2, table_prediction_n, 10, num_col_classifier)
Expand Down Expand Up @@ -4717,12 +4711,12 @@ def deskew(polygon):
regions_without_separators_d = cv2.erode(regions_without_separators_d[:, :], KERNEL, iterations=6)

if np.abs(slope_deskew) < SLOPE_THRESHOLD:
boxes, peaks_neg_tot_tables = return_boxes_of_images_by_order_of_reading_new(
boxes, _ = return_boxes_of_images_by_order_of_reading_new(
splitter_y_new, regions_without_separators, matrix_of_lines_ch,
num_col_classifier, erosion_hurts, self.tables, self.right2left,
logger=self.logger)
else:
boxes_d, peaks_neg_tot_tables_d = return_boxes_of_images_by_order_of_reading_new(
boxes_d, _ = return_boxes_of_images_by_order_of_reading_new(
splitter_y_new_d, regions_without_separators_d, matrix_of_lines_ch_d,
num_col_classifier, erosion_hurts, self.tables, self.right2left,
logger=self.logger)
Expand Down
Loading