diff --git a/notebooks/ko/_toctree.yml b/notebooks/ko/_toctree.yml index d420590c..ef7095f5 100644 --- a/notebooks/ko/_toctree.yml +++ b/notebooks/ko/_toctree.yml @@ -24,4 +24,6 @@ isExpanded: false sections: - local: faiss_with_hf_datasets_and_clip - title: 유사성 검색을 위한 멀티모달 데이터 임베딩 \ No newline at end of file + title: 유사성 검색을 위한 멀티모달 데이터 임베딩 + - local: structured_generation_vision_language_models + title: VLM을 사용하여 이미지와 문서로부터 구조화된 데이터 생성 \ No newline at end of file diff --git a/notebooks/ko/index.md b/notebooks/ko/index.md index c68982d8..e3418b5b 100644 --- a/notebooks/ko/index.md +++ b/notebooks/ko/index.md @@ -13,6 +13,8 @@ - [다중 에이전트 계층 구조에서 여러 에이전트가 협업하도록 하기](multiagent_web_assistant) - [유사성 검색을 위한 멀티모달 데이터 임베딩](faiss_with_hf_datasets_and_clip) +- [VLM을 사용하여 이미지와 문서로부터 구조화된 데이터 생성](structured_generation_vision_language_models) + 더 다양한 노트북을 확인하고 싶다면 Cookbook's [GitHub 리포지토리](https://github.com/huggingface/cookbook)에 방문해보세요. diff --git a/notebooks/ko/structured_generation_vision_language_models.ipynb b/notebooks/ko/structured_generation_vision_language_models.ipynb new file mode 100644 index 00000000..8295b504 --- /dev/null +++ b/notebooks/ko/structured_generation_vision_language_models.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"id":"jbiwcsYOU-8-"},"source":["# VLM을 사용하여 이미지와 문서로부터 구조화된 데이터 생성하기\n","\n","HuggingFaceTB의 SmolVLM-Instruct 모델을 사용하여 문서로부터 구조화된 정보를 추출합니다. Hugging Face Transformers 라이브러리를 통해 실행되며, [Outlines](https://github.com/dottxt-ai/outlines) 라이브러리를 사용하여 토큰 샘플링 확률을 제한함으로써 **정형화된 출력 형식을 생성**하도록 돕습니다.\n","\n","\u003e 이 방식은 [Outlines tutorial](https://dottxt-ai.github.io/outlines/latest/cookbook/atomic_caption/) 기반으로 진행합니다.\n","\n","## Dependencies and imports\n","\n","첫번째로, 필수 라이브러리를 설치합니다."]},{"cell_type":"code","execution_count":16,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2605,"status":"ok","timestamp":1745324206997,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"MEX-Hh6lU-9A","outputId":"a06c8ff2-3b63-40ff-d384-28c198a13a11"},"outputs":[{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: accelerate in /usr/local/lib/python3.11/dist-packages (1.5.2)\n","Requirement already satisfied: outlines in /usr/local/lib/python3.11/dist-packages (0.2.3)\n","Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.51.3)\n","Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.6.0+cu124)\n","Requirement already satisfied: flash-attn in /usr/local/lib/python3.11/dist-packages (2.7.4.post1)\n","Requirement already satisfied: datasets in /usr/local/lib/python3.11/dist-packages (3.5.0)\n","Requirement already satisfied: sentencepiece in /usr/local/lib/python3.11/dist-packages (0.2.0)\n","Requirement already satisfied: numpy\u003c3.0.0,\u003e=1.17 in /usr/local/lib/python3.11/dist-packages (from accelerate) (2.0.2)\n","Requirement already satisfied: packaging\u003e=20.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (24.2)\n","Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from accelerate) (5.9.5)\n","Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from accelerate) (6.0.2)\n","Requirement already satisfied: huggingface-hub\u003e=0.21.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.30.2)\n","Requirement already satisfied: safetensors\u003e=0.4.3 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.5.3)\n","Requirement already satisfied: interegular in /usr/local/lib/python3.11/dist-packages (from outlines) (0.3.3)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from outlines) (3.1.6)\n","Requirement already satisfied: lark in /usr/local/lib/python3.11/dist-packages (from outlines) (1.2.2)\n","Requirement already satisfied: nest_asyncio in /usr/local/lib/python3.11/dist-packages (from outlines) (1.6.0)\n","Requirement already satisfied: cloudpickle in /usr/local/lib/python3.11/dist-packages (from outlines) (3.1.1)\n","Requirement already satisfied: diskcache in /usr/local/lib/python3.11/dist-packages (from outlines) (5.6.3)\n","Requirement already satisfied: pydantic\u003e=2.0 in /usr/local/lib/python3.11/dist-packages (from outlines) (2.11.3)\n","Requirement already satisfied: referencing in /usr/local/lib/python3.11/dist-packages (from outlines) (0.36.2)\n","Requirement already satisfied: jsonschema in /usr/local/lib/python3.11/dist-packages (from outlines) (4.23.0)\n","Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from outlines) (2.32.3)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from outlines) (4.67.1)\n","Requirement already satisfied: typing_extensions in /usr/local/lib/python3.11/dist-packages (from outlines) (4.13.2)\n","Requirement already satisfied: iso3166 in /usr/local/lib/python3.11/dist-packages (from outlines) (2.1.1)\n","Requirement already satisfied: airportsdata in /usr/local/lib/python3.11/dist-packages (from outlines) (20250224)\n","Requirement already satisfied: outlines_core==0.1.26 in /usr/local/lib/python3.11/dist-packages (from outlines) (0.1.26)\n","Requirement already satisfied: genson in /usr/local/lib/python3.11/dist-packages (from outlines) (1.3.0)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.18.0)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n","Requirement already satisfied: tokenizers\u003c0.22,\u003e=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.1)\n","Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\n","Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2024.12.0)\n","Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n","Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n","Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n","Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch) (9.1.0.70)\n","Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.5.8)\n","Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch) (11.2.1.3)\n","Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch) (10.3.5.147)\n","Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch) (11.6.1.9)\n","Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch) (12.3.1.170)\n","Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch) (0.6.2)\n","Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n","Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n","Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n","Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.2.0)\n","Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n","Requirement already satisfied: mpmath\u003c1.4,\u003e=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1-\u003etorch) (1.3.0)\n","Requirement already satisfied: einops in /usr/local/lib/python3.11/dist-packages (from flash-attn) (0.8.1)\n","Requirement already satisfied: pyarrow\u003e=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (18.1.0)\n","Requirement already satisfied: dill\u003c0.3.9,\u003e=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.3.8)\n","Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)\n","Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets) (3.5.0)\n","Requirement already satisfied: multiprocess\u003c0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.70.16)\n","Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from datasets) (3.11.15)\n","Requirement already satisfied: aiohappyeyeballs\u003e=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp-\u003edatasets) (2.6.1)\n","Requirement already satisfied: aiosignal\u003e=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp-\u003edatasets) (1.3.2)\n","Requirement already satisfied: attrs\u003e=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp-\u003edatasets) (25.3.0)\n","Requirement already satisfied: frozenlist\u003e=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp-\u003edatasets) (1.5.0)\n","Requirement already satisfied: multidict\u003c7.0,\u003e=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp-\u003edatasets) (6.4.3)\n","Requirement already satisfied: propcache\u003e=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp-\u003edatasets) (0.3.1)\n","Requirement already satisfied: yarl\u003c2.0,\u003e=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp-\u003edatasets) (1.19.0)\n","Requirement already satisfied: annotated-types\u003e=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic\u003e=2.0-\u003eoutlines) (0.7.0)\n","Requirement already satisfied: pydantic-core==2.33.1 in /usr/local/lib/python3.11/dist-packages (from pydantic\u003e=2.0-\u003eoutlines) (2.33.1)\n","Requirement already satisfied: typing-inspection\u003e=0.4.0 in /usr/local/lib/python3.11/dist-packages (from pydantic\u003e=2.0-\u003eoutlines) (0.4.0)\n","Requirement already satisfied: charset-normalizer\u003c4,\u003e=2 in /usr/local/lib/python3.11/dist-packages (from requests-\u003eoutlines) (3.4.1)\n","Requirement already satisfied: idna\u003c4,\u003e=2.5 in /usr/local/lib/python3.11/dist-packages (from requests-\u003eoutlines) (3.10)\n","Requirement already satisfied: urllib3\u003c3,\u003e=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests-\u003eoutlines) (2.3.0)\n","Requirement already satisfied: certifi\u003e=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests-\u003eoutlines) (2025.1.31)\n","Requirement already satisfied: MarkupSafe\u003e=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2-\u003eoutlines) (3.0.2)\n","Requirement already satisfied: jsonschema-specifications\u003e=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema-\u003eoutlines) (2024.10.1)\n","Requirement already satisfied: rpds-py\u003e=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema-\u003eoutlines) (0.24.0)\n","Requirement already satisfied: python-dateutil\u003e=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas-\u003edatasets) (2.8.2)\n","Requirement already satisfied: pytz\u003e=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas-\u003edatasets) (2025.2)\n","Requirement already satisfied: tzdata\u003e=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas-\u003edatasets) (2025.2)\n","Requirement already satisfied: six\u003e=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil\u003e=2.8.2-\u003epandas-\u003edatasets) (1.17.0)\n"]}],"source":["%pip install accelerate outlines transformers torch flash-attn datasets sentencepiece"]},{"cell_type":"markdown","metadata":{"id":"-0KQwjOLU-9A"},"source":["필수 라이브러리"]},{"cell_type":"code","execution_count":17,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1745324207001,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"Xo5DQMlhU-9A"},"outputs":[],"source":["import outlines # 구조화된 결과 생성을 돕는 라이브러리\n","import torch\n","\n","from datasets import load_dataset\n","from outlines.models.transformers_vision import transformers_vision\n","from transformers import AutoModelForImageTextToText, AutoProcessor\n","from pydantic import BaseModel"]},{"cell_type":"markdown","metadata":{"id":"AzbKrCEBU-9B"},"source":["## 모델 초기화\n","\n","먼저 [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) 모델을 초기화합니다. Outline은 모델 클래스와 프로세스 클래스를 정보를 명시적으로 전달해야 하므로, 해당 클래스를 반환하는 함수를 작성하여 사용하는 방식으로 예제를 구성합니다. 다른 방법으로는 [Hub repo files](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct/tree/main)에서 직접 모델과 토크나이저를 보고 직접 클래스를 지정하여 가져오는 방식도 사용할 수 있습니다."]},{"cell_type":"code","execution_count":18,"metadata":{"executionInfo":{"elapsed":9636,"status":"ok","timestamp":1745324216647,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"9fska9wcU-9B"},"outputs":[],"source":["model_name = \"HuggingFaceTB/SmolVLM-Instruct\"\n","\n","\n","def get_model_and_processor_class(model_name: str):\n"," model = AutoModelForImageTextToText.from_pretrained(model_name)\n"," processor = AutoProcessor.from_pretrained(model_name)\n"," classes = model.__class__, processor.__class__\n"," del model, processor\n"," return classes\n","\n","\n","model_class, processor_class = get_model_and_processor_class(model_name)\n","\n","if torch.cuda.is_available():\n"," device = \"cuda\"\n","elif torch.backends.mps.is_available():\n"," device = \"mps\"\n","else:\n"," device = \"cpu\"\n","\n","model = transformers_vision(\n"," model_name,\n"," model_class=model_class,\n"," # device=device,\n"," model_kwargs={\"torch_dtype\": torch.bfloat16, \"device_map\": \"auto\"},\n"," # processor_kwargs={\"device\": device},\n"," processor_class=processor_class,\n",")"]},{"cell_type":"code","execution_count":19,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":43,"status":"ok","timestamp":1745324216692,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"FkyCMvvad3AF","outputId":"710267f6-2005-4ade-e210-9943b394ed66"},"outputs":[{"name":"stdout","output_type":"stream","text":["\u003cclass 'transformers.models.idefics3.processing_idefics3.Idefics3Processor'\u003e\n","\u003cclass 'type'\u003e\n"]}],"source":["print(processor_class)\n","print(type(processor_class))"]},{"cell_type":"markdown","metadata":{"id":"nk8qPHgqU-9B"},"source":["## 구조화된 출력 생성\n","\n","이제 모델이 구조화된 출력을 생성하는 함수를 정의합니다. 이미지와 해당 이미지에 대한 질문-선택 응답, 거부 응답 쌍으로 구성되어 있는 [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)을 사용합니다. 이 데이터세트를 활용할 수 있으나, 여기에 추가적으로 text=image-to-text 데이터를 생성하여 우리만의 **구조화된 데이터세트**를 만들고, 이후 모델을 fine-tuning 하는데 활용할 계획입니다. 구체적으로 이미지에 대해 캡션, 질문, 간단한 품질 판단 태그를 생성하는 작업을 수행할 것입니다."]},{"cell_type":"code","execution_count":20,"metadata":{"executionInfo":{"elapsed":826,"status":"ok","timestamp":1745324217522,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"uxilvTbKU-9C"},"outputs":[],"source":["class ImageData(BaseModel):\n"," quality: str\n"," description: str\n"," question: str\n","\n","structured_generator = outlines.generate.json(model, ImageData)"]},{"cell_type":"markdown","metadata":{"id":"qkhH04vRU-9C"},"source":["프롬프트를 작성합니다."]},{"cell_type":"code","execution_count":21,"metadata":{"executionInfo":{"elapsed":1,"status":"ok","timestamp":1745324217531,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"DSmWvVXnU-9C"},"outputs":[],"source":["prompt = \"\"\"\n","You are an image analysis assisant.\n","\n","Provide a quality tag, a description and a question.\n","\n","The quality can either be \"good\", \"okay\" or \"bad\".\n","The question should be concise and objective.\n","\n","Return your response as a valid JSON object.\n","\"\"\".strip()"]},{"cell_type":"markdown","metadata":{"id":"kOLeBzi9U-9C"},"source":["이미지 데이터세트를 로드합니다."]},{"cell_type":"code","execution_count":22,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2656,"status":"ok","timestamp":1745324220188,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"lUfgXf_dU-9C","outputId":"c53f0bce-966f-4daf-8733-67359af3eb08"},"outputs":[{"data":{"text/plain":["Dataset({\n"," features: ['ds_name', 'image', 'question', 'chosen', 'rejected', 'origin_dataset', 'origin_split', 'idx', 'image_path'],\n"," num_rows: 10\n","})"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["dataset = load_dataset(\"openbmb/RLAIF-V-Dataset\", split=\"train[:10]\")\n","dataset"]},{"cell_type":"markdown","metadata":{"id":"P9QiBSIJU-9C"},"source":["이미지로부터 구조화된 정보를 추출하는 함수를 정의합니다. `apply_chat_template` 메소드를 사용하여 프롬프트 형식을 지정하고, 이미지와 함께 형식을 모델에 전달합니다."]},{"cell_type":"code","execution_count":23,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":192},"executionInfo":{"elapsed":101000,"status":"ok","timestamp":1745324321234,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"o6GSAX__U-9C","outputId":"de186d54-b1f4-42b2-b0ef-4d41b1665299"},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.11/dist-packages/dill/_dill.py:414: PicklingWarning: Cannot locate reference to \u003cclass '__main__.ImageData'\u003e.\n"," StockPickler.save(self, obj, save_persistent_id)\n","/usr/local/lib/python3.11/dist-packages/dill/_dill.py:414: PicklingWarning: Cannot pickle \u003cclass '__main__.ImageData'\u003e: __main__.ImageData has recursive self-references that trigger a RecursionError.\n"," StockPickler.save(self, obj, save_persistent_id)\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"b2ca6268b5e2482db961e0058aae8912","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/10 [00:00\u003c?, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"text/plain":["Dataset({\n"," features: ['ds_name', 'image', 'question', 'chosen', 'rejected', 'origin_dataset', 'origin_split', 'idx', 'image_path', 'synthetic_question', 'synthetic_description', 'synthetic_quality'],\n"," num_rows: 10\n","})"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["def extract(row):\n"," messages = [\n"," {\n"," \"role\": \"user\",\n"," \"content\": [{\"type\": \"image\"}, {\"type\": \"text\", \"text\": prompt}],\n"," },\n"," ]\n","\n"," formatted_prompt = model.processor.apply_chat_template( # apply_chat_template: Hugging Face에서 제공하는 chat 템플릿 적용 함수. 메시지를 모델이 이해할 수 있는 포맷으로 변환.\n"," messages, add_generation_prompt=True\n"," )\n","\n"," result = structured_generator(formatted_prompt, [row[\"image\"]])\n"," row['synthetic_question'] = result.question\n"," row['synthetic_description'] = result.description\n"," row['synthetic_quality'] = result.quality\n"," return row\n","\n","\n","dataset = dataset.map(lambda x: extract(x))\n","dataset"]},{"cell_type":"code","execution_count":24,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":74,"status":"ok","timestamp":1745324321311,"user":{"displayName":"염지현","userId":"09299668898672663436"},"user_tz":-540},"id":"TkW2j4GSAUkf","outputId":"e38473b0-99d3-4ed1-d764-0825c0b5e47a"},"outputs":[{"data":{"text/plain":["Dataset({\n"," features: ['ds_name', 'image', 'question', 'chosen', 'rejected', 'origin_dataset', 'origin_split', 'idx', 'image_path', 'synthetic_question', 'synthetic_description', 'synthetic_quality'],\n"," num_rows: 10\n","})"]},"execution_count":24,"metadata":{},"output_type":"execute_result"}],"source":["dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true,"base_uri":"https://localhost:8080/"},"id":"GYSu7MIvAqR9"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"," _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n"," _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n"," _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n"," _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n"," _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n","\n"," A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.\n"," Setting a new token will erase the existing one.\n"," To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n","Enter your token (input will not be visible): "]}],"source":["!huggingface-cli login\n"]},{"cell_type":"markdown","metadata":{"id":"ZKp0rFAEU-9C"},"source":["새로 만든 데이터세트를 Hugging Face Hub에 업로드 해봅시다."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"hbrybnULBrdT"},"outputs":[],"source":["from huggingface_hub import create_repo\n","create_repo(\"structured-generation-mydata\", repo_type=\"dataset\")\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"qYd_mqS1U-9D"},"outputs":[],"source":["# dataset.push_to_hub(\"davidberenstein1957/structured-generation-information-extraction-vlms-openbmb-RLAIF-V-Dataset\", split=\"train\")\n","\n","dataset.push_to_hub(\"yeomjihyun/structured-generation-mydata\", split=\"train\")"]},{"cell_type":"markdown","metadata":{"id":"g347MfxIU-9D"},"source":["\u003ciframe\n"," src=\"https://huggingface.co/datasets/davidberenstein1957/structured-generation-information-extraction-vlms-openbmb-RLAIF-V-Dataset/embed/viewer/default/train?row=3\"\n"," frameborder=\"0\"\n"," width=\"100%\"\n"," height=\"560px\"\n","\u003e\u003c/iframe\u003e"]},{"cell_type":"markdown","metadata":{"id":"2UL0Rl4AU-9D"},"source":["결과가 완벽하지 않다면 다른 모델과 프롬프트를 실험해보기에 좋은 출발점입니다!"]},{"cell_type":"markdown","metadata":{"id":"-86wWnueU-9D"},"source":["## 결론\n","\n","이번 튜토리얼에서는 비전-언어 모델을 사용해 문서에서 구조화된 정보를 추출하는 방법을 알아보았습니다.\n","비슷한 추출 방법을 활용하면 PDF 문서를 pdf2image와 같은 도구를 사용해 이미지로 변환한 후, 각 페이지 이미지에 대해 정보를 추출할 수 있습니다.\n","\n","```python\n","pdf_path = \"path/to/your/pdf/file.pdf\"\n","pages = convert_from_path(pdf_path)\n","for page in pages:\n"," extract_objects = extract_objects(page, prompt)\n","```\n","\n","## 다음 단계\n","[Outlines](https://github.com/outlines-ai/outlines) 라이브러리를 참고하여 다양한 사용법과 파라미터를 더 알아보세요.\n","\n","여러분의 모델을 사용해, 직접 사용 사례에 맞는 정보를 추출해보세요.\n","\n","문서에서 구조화된 정보를 추출하는 다른 방법도 탐색해보세요."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"E23dEbpSNw8E"},"outputs":[],"source":[]}],"metadata":{"accelerator":"GPU","colab":{"gpuType":"T4","machine_shape":"hm","name":"","provenance":[{"file_id":"https://github.com/huggingface/cookbook/blob/main/notebooks/en/structured_generation_vision_language_models.ipynb","timestamp":1745106844202}],"version":""},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.11"},"widgets":{"application/vnd.jupyter.widget-state+json":{"104bc67a4a76413ab262beea87786239":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_f500bea965d64b76a99e92b7f37475ee","placeholder":"​","style":"IPY_MODEL_609b299fd437480b8438de3cfc9f3c4c","value":"Map: 100%"}},"13d6da0a7db345d380088d7ab537f3d7":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3ec19b665bb749cba983e971afca56e3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"425561cf8a4648e19920a809d513d161":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"609b299fd437480b8438de3cfc9f3c4c":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"62e446b957074c2cadb28c073c8be4c6":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"67e03675751548c4b1c4792db3b2d080":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_425561cf8a4648e19920a809d513d161","placeholder":"​","style":"IPY_MODEL_82a2266f9125496ba6b0430b9841bbe3","value":" 10/10 [01:24\u0026lt;00:00,  8.94s/ examples]"}},"82a2266f9125496ba6b0430b9841bbe3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8daa9bd5dad44d538471cbe284db047d":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_13d6da0a7db345d380088d7ab537f3d7","max":10,"min":0,"orientation":"horizontal","style":"IPY_MODEL_3ec19b665bb749cba983e971afca56e3","value":10}},"b2ca6268b5e2482db961e0058aae8912":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_104bc67a4a76413ab262beea87786239","IPY_MODEL_8daa9bd5dad44d538471cbe284db047d","IPY_MODEL_67e03675751548c4b1c4792db3b2d080"],"layout":"IPY_MODEL_62e446b957074c2cadb28c073c8be4c6"}},"f500bea965d64b76a99e92b7f37475ee":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}}}}},"nbformat":4,"nbformat_minor":0} \ No newline at end of file