Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,23 @@ EXTRAS ?=
DOCKER_BASE_IMAGE ?= docker.io/ocrd/core-cuda-tf2:latest
DOCKER_TAG ?= ocrd/eynollah
DOCKER ?= docker
WGET = wget -O

#SEG_MODEL := https://qurator-data.de/eynollah/2021-04-25/models_eynollah.tar.gz
#SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah_renamed.tar.gz
# SEG_MODEL := https://qurator-data.de/eynollah/2022-04-05/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
#SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
#SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
SEG_MODEL := https://zenodo.org/records/17295988/files/models_layout_v0_6_0.tar.gz?download=1
SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL)))
SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%)

BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip
BIN_MODEL := https://zenodo.org/records/17295988/files/models_binarization_v0_6_0.tar.gz?download=1
# Strip the "?download=1" query suffix so the downloaded archive gets a clean
# local file name, consistent with SEG_MODELFILE and OCR_MODELFILE.
BIN_MODELFILE = $(notdir $(patsubst %?download=1,%,$(BIN_MODEL)))
# NOTE(review): the recipe for this target runs `tar zxf` on the archive;
# confirm the archive really unpacks to a directory with this name, otherwise
# the target is never considered up to date.
BIN_MODELNAME := default-2021-03-09

OCR_MODEL := https://zenodo.org/records/17236998/files/models_ocr_v0_5_1.tar.gz?download=1
OCR_MODEL := https://zenodo.org/records/17295988/files/models_ocr_v0_6_0.tar.gz?download=1
OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL)))
OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%)

Expand Down Expand Up @@ -55,22 +57,21 @@ help:
# END-EVAL


# Download and extract models to $(PWD)/models_layout_v0_5_0
# Download and extract models to $(PWD)/models_layout_v0_6_0
models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)

# do not download these files if we already have the directories
.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE)

$(BIN_MODELFILE):
wget -O $@ $(BIN_MODEL)
$(WGET) $@ $(BIN_MODEL)
$(SEG_MODELFILE):
wget -O $@ $(SEG_MODEL)
$(WGET) $@ $(SEG_MODEL)
$(OCR_MODELFILE):
wget -O $@ $(OCR_MODEL)
$(WGET) $@ $(OCR_MODEL)

$(BIN_MODELNAME): $(BIN_MODELFILE)
mkdir $@
unzip -d $@ $<
tar zxf $<
$(SEG_MODELNAME): $(SEG_MODELFILE)
tar zxf $<
$(OCR_MODELNAME): $(OCR_MODELFILE)
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ make install EXTRAS=OCR

## Models

Pretrained models can be downloaded from [zenodo](https://zenodo.org/records/17194824) or [huggingface](https://huggingface.co/SBB?search_models=eynollah).
Pretrained models can be downloaded from [zenodo](https://doi.org/10.5281/zenodo.17194823) or [huggingface](https://huggingface.co/SBB?search_models=eynollah).

For documentation on models, have a look at [`models.md`](https://github.com/qurator-spk/eynollah/tree/main/docs/models.md).
Model cards are also provided for our trained models.
Expand Down Expand Up @@ -162,7 +162,7 @@ formally described in [`ocrd-tool.json`](https://github.com/qurator-spk/eynollah

In this case, the source image file group with (preferably) RGB images should be used as input like this:

ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models eynollah_layout_v0_5_0
ocrd-eynollah-segment -I OCR-D-IMG -O OCR-D-SEG -P models eynollah_layout_v0_6_0

If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynollah behaves as follows:
- existing regions are kept and ignored (i.e. in effect they might overlap segments from Eynollah results)
Expand All @@ -174,7 +174,7 @@ If the input file group is PAGE-XML (from a previous OCR-D workflow step), Eynol
(because some other preprocessing step was in effect like `denoised`), then
the output PAGE-XML will be based on that as new top-level (`@imageFilename`)

ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models eynollah_layout_v0_5_0
ocrd-eynollah-segment -I OCR-D-XYZ -O OCR-D-SEG -P models eynollah_layout_v0_6_0

In general, it makes more sense to add other workflow steps **after** Eynollah.

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ tensorflow < 2.13
numba <= 0.58.1
scikit-image
biopython
tabulate
84 changes: 75 additions & 9 deletions src/eynollah/cli.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
from dataclasses import dataclass
import sys
import click
import logging
from typing import Tuple, List
from ocrd_utils import initLogging, getLevelName, getLogger
from eynollah.eynollah import Eynollah, Eynollah_ocr
from eynollah.eynollah import Eynollah
from eynollah.eynollah_ocr import Eynollah_ocr
from eynollah.sbb_binarize import SbbBinarizer
from eynollah.image_enhancer import Enhancer
from eynollah.mb_ro_on_layout import machine_based_reading_order_on_layout
from eynollah.model_zoo import EynollahModelZoo

from .cli_models import models_cli

# Root command group of the CLI. Subcommands are attached to it with
# @main.command() and main.add_command(); the group itself does no work.
@click.group()
def main():
    pass

main.add_command(models_cli, 'models')

@main.command()
@click.option(
"--input",
Expand Down Expand Up @@ -79,18 +87,38 @@ def machine_based_reading_order(input, dir_in, out, model, log_level):
type=click.Path(file_okay=True, dir_okay=True),
required=True,
)
@click.option(
'-M',
'--mode',
type=click.Choice(['single', 'multi']),
default='single',
help="Whether to use the (faster) single-model binarization or the (slightly better) multi-model binarization"
)
@click.option(
"--log_level",
"-l",
type=click.Choice(['OFF', 'DEBUG', 'INFO', 'WARN', 'ERROR']),
help="Override log level globally to this",
)
def binarization(patches, model_dir, input_image, dir_in, output, log_level):
def binarization(
patches,
model_dir,
input_image,
mode,
dir_in,
output,
log_level,
):
assert bool(input_image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
binarizer = SbbBinarizer(model_dir)
binarizer = SbbBinarizer(model_dir, mode=mode)
if log_level:
binarizer.log.setLevel(getLevelName(log_level))
binarizer.run(image_path=input_image, use_patches=patches, output=output, dir_in=dir_in)
binarizer.run(
image_path=input_image,
use_patches=patches,
output=output,
dir_in=dir_in
)


@main.command()
Expand Down Expand Up @@ -198,15 +226,17 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low
@click.option(
"--model",
"-m",
'model_basedir',
help="directory of models",
type=click.Path(exists=True, file_okay=False),
# default=f"{os.environ['HOME']}/.local/share/ocrd-resources/ocrd-eynollah-segment",
required=True,
)
@click.option(
"--model_version",
"-mv",
help="override default versions of model categories",
type=(str, str),
help="override default versions of model categories, syntax is 'CATEGORY VARIANT PATH', e.g 'region light /path/to/model'. See eynollah list-models for the full list",
type=(str, str, str),
multiple=True,
)
@click.option(
Expand Down Expand Up @@ -380,7 +410,43 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low
help="Setup a basic console logger",
)

def layout(image, out, overwrite, dir_in, model, model_version, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level, setup_logging):
def layout(
image,
out,
overwrite,
dir_in,
model_basedir,
model_version,
save_images,
save_layout,
save_deskewed,
save_all,
extract_only_images,
save_page,
enable_plotting,
allow_enhancement,
curved_line,
textline_light,
full_layout,
tables,
right2left,
input_binary,
allow_scaling,
headers_off,
light_version,
reading_order_machine_based,
do_ocr,
transformer_ocr,
batch_size_ocr,
num_col_upper,
num_col_lower,
threshold_art_class_textline,
threshold_art_class_layout,
skip_layout_and_reading_order,
ignore_page_extraction,
log_level,
setup_logging,
):
if setup_logging:
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
Expand Down Expand Up @@ -410,8 +476,8 @@ def layout(image, out, overwrite, dir_in, model, model_version, save_images, sav
assert not extract_only_images or not headers_off, "Image extraction -eoi can not be set alongside headers_off -ho"
assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
eynollah = Eynollah(
model,
model_versions=model_version,
model_basedir,
model_overrides=model_version,
extract_only_images=extract_only_images,
enable_plotting=enable_plotting,
allow_enhancement=allow_enhancement,
Expand Down
93 changes: 93 additions & 0 deletions src/eynollah/cli_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from dataclasses import dataclass
from pathlib import Path
from typing import List, Set, Tuple
import click

from eynollah.model_zoo.default_specs import MODELS_VERSION
from .model_zoo import EynollahModelZoo


@dataclass
class EynollahCliCtx:
    """State shared between the `models` group and its subcommands via `ctx.obj`."""

    # The model zoo built from the group-level options.
    model_zoo: EynollahModelZoo


@click.group()
@click.pass_context
@click.option(
    "--model",
    "-m",
    'model_basedir',
    help="directory of models",
    type=click.Path(exists=True, file_okay=False),
    # default=f"{os.environ['HOME']}/.local/share/ocrd-resources/ocrd-eynollah-segment",
    required=True,
)
@click.option(
    "--model-overrides",
    "-mv",
    help=(
        "override default versions of model categories, syntax is 'CATEGORY VARIANT PATH', "
        "e.g. 'region light /path/to/model'. See 'eynollah models -m DIR list' for the full list"
    ),
    type=(str, str, str),
    multiple=True,
)
def models_cli(
    ctx,
    model_basedir: str,
    model_overrides: List[Tuple[str, str, str]],
):
    """
    Organize models for the various runners in eynollah.
    """
    # NOTE: the docstring above is kept short on purpose because it is shown as
    # the group's help text; parameter notes therefore live in comments.
    #
    # ctx:             click context; its `obj` is populated here for subcommands.
    # model_basedir:   existing directory that holds the models (from -m/--model).
    # model_overrides: zero or more (CATEGORY, VARIANT, PATH) triples
    #                  (from -mv/--model-overrides, repeatable).
    #
    # Build the zoo once at group level so that every subcommand can reach it
    # through `ctx.obj.model_zoo`.
    zoo = EynollahModelZoo(basedir=model_basedir, model_overrides=model_overrides)
    ctx.obj = EynollahCliCtx(model_zoo=zoo)


@models_cli.command('list')
@click.pass_context
def list_models(ctx):
    """
    List all the models in the zoo
    """
    # The zoo object is printed directly; its string form is the listing.
    zoo = ctx.obj.model_zoo
    print(zoo)


@models_cli.command('package')
@click.option(
    '--set-version', '-V', 'version', help="Version to use for packaging", default=MODELS_VERSION, show_default=True
)
@click.argument('output_dir')
@click.pass_context
def package(
    ctx,
    version,
    output_dir,
):
    """
    Generate shell code to copy all the models in the zoo into properly named folders in OUTPUT_DIR for distribution.

    eynollah models -m SRC package OUTPUT_DIR

    SRC should contain a directory "models_eynollah" containing all the models.
    """
    # Imported locally so this command does not depend on a file-level import.
    from shlex import quote

    # ctx:        click context; `ctx.obj.model_zoo` is the zoo built by the group.
    # version:    version string embedded in the distribution folder names.
    # output_dir: directory below which the distribution folders are laid out.
    #
    # Nothing is copied here: the function only prints shell commands.
    # Raises click.ClickException if a model does not live below a
    # "models_eynollah" directory.
    zoo = ctx.obj.model_zoo
    mkdirs: Set[Path] = set()
    copies: Set[Tuple[Path, Path]] = set()
    for spec in zoo.specs.specs:
        # skip these as they are dependent on the ocr model
        if spec.category in ('num_to_char', 'characters'):
            continue
        model_path: Path = zoo.model_path(spec.category, spec.variant)
        # Only copy the top-most directory relative to models_eynollah
        src = model_path
        while src.parent.name != 'models_eynollah':
            if src.parent == src:
                # Reached the filesystem root (or "."): without this guard the
                # walk up the tree would never terminate.
                raise click.ClickException(
                    f"Model path {model_path} is not located below a 'models_eynollah' directory"
                )
            src = src.parent
        for dist in spec.dists:
            dist_dir = Path(f"{output_dir}/models_{dist}_{version}/models_eynollah")
            copies.add((src, dist_dir))
            mkdirs.add(dist_dir)

    # Sets are used for de-duplication only; sort them so that the generated
    # script is identical from run to run. Paths are shell-quoted so that
    # directories containing spaces or other special characters stay intact.
    for dist_dir in sorted(mkdirs):
        print(f"mkdir -p {quote(str(dist_dir))}")
    for src, dst in sorted(copies):
        print(f"cp -r {quote(str(src))} {quote(str(dst))}")
    for dist_dir in sorted(mkdirs):
        zip_path = Path(f'../{dist_dir.parent.name}.zip')
        # "&&" instead of ";" so zip never runs in the wrong directory when
        # the cd fails.
        print(f"(cd {quote(f'{dist_dir}/..')} && zip -r {quote(str(zip_path))} models_eynollah)")
Loading
Loading