Description
Bug
https://www.mdpi.com/1099-4300/21/1/3
I tried to convert the PDF of this paper to Markdown, but docling outputs `<!-- formula-not-decoded -->`
in normal mode. When I added the `--enrich-formula`
flag, it failed with the following error:
docling --enrich-formula entropy-1.pdf
Traceback (most recent call last):
File "/Users/bwen/.pyenv/versions/3.11.1/bin/docling", line 8, in <module>
sys.exit(app())
^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/typer/main.py", line 338, in __call__
raise e
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/typer/main.py", line 321, in __call__
return get_command(self)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/typer/core.py", line 665, in main
return _main(
^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/typer/core.py", line 197, in _main
rv = self.invoke(ctx)
^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/typer/main.py", line 703, in wrapper
return callback(**use_params)
^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/cli/main.py", line 593, in convert
export_documents(
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/cli/main.py", line 167, in export_documents
for conv_res in conv_results:
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/document_converter.py", line 243, in convert_all
for conv_res in conv_res_iter:
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/document_converter.py", line 278, in _convert
for item in map(
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/document_converter.py", line 324, in _process_document
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/document_converter.py", line 345, in _execute_pipeline
pipeline = self._get_pipeline(in_doc.format)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/document_converter.py", line 307, in _get_pipeline
self.initialized_pipelines[cache_key] = pipeline_class(
^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/pipeline/standard_pdf_pipeline.py", line 106, in __init__
CodeFormulaModel(
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling/models/code_formula_model.py", line 107, in __init__
self.code_formula_model = CodeFormulaPredictor(
^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/docling_ibm_models/code_formula_model/code_formula_predictor.py", line 83, in __init__
self._tokenizer = AutoTokenizer.from_pretrained(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 889, in from_pretrained
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2163, in from_pretrained
return cls._from_pretrained(
^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2397, in _from_pretrained
tokenizer = cls(*init_inputs, **init_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py", line 99, in __init__
super().__init__(
File "/Users/bwen/.pyenv/versions/3.11.1/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 115, in __init__
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Exception: data did not match any variant of untagged enum ModelWrapper at line 255422 column 3
Steps to reproduce
Run `docling --enrich-formula entropy-1.pdf`
Docling version
Docling version: 2.30.0
Docling Core version: 2.27.0
Docling IBM Models version: 3.4.1
Docling Parse version: 4.0.1
Python: cpython-311 (3.11.1)
Platform: macOS-15.5-x86_64-i386-64bit