diff --git a/.DS_Store b/.DS_Store index 3d050a7..75bcbe7 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/python/.DS_Store b/python/.DS_Store deleted file mode 100644 index 6989235..0000000 Binary files a/python/.DS_Store and /dev/null differ diff --git a/python/post-processing-samples/ACORD127.ipynb b/python/post-processing-samples/ACORD127.ipynb new file mode 100644 index 0000000..0fb7b2b --- /dev/null +++ b/python/post-processing-samples/ACORD127.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1c75471f", + "metadata": {}, + "source": [ + "# Processing ACORD-127 forms using Textract post-processing libraries" + ] + }, + { + "cell_type": "markdown", + "id": "505aa812", + "metadata": {}, + "source": [ + "## Use case : Hierarchical Key-Value mapping\n", + "\n", + "When a document contains hierarchical structuring, it is an important IDP post-processing task to infer the context within the structures. For example, in the document sample of an ACORD 127 form - Page 3 below, we want to infer the relationship between each `Veh #` key and its corresponding vehicle information items within each highlighted section. This can be done using Textract post-processing libraries for hierarchical key-value pair mapping. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f5c72002", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from PIL import Image\n", + "from IPython.display import Image, display, HTML, JSON, IFrame\n", + "\n", + "documentName = \"doc-samples/Acord-127-pg3-annotated.png\"\n", + "display(IFrame(documentName, 500, 600));" + ] + }, + { + "cell_type": "markdown", + "id": "0247490c", + "metadata": {}, + "source": [ + "## Step 1: Installation\n" + ] + }, + { + "cell_type": "markdown", + "id": "d8905718", + "metadata": {}, + "source": [ + "### [Amazon Textract Geofinder](https://pypi.org/project/amazon-textract-geofinder/)\n", + "Amazon Textract package for easier access to data through geometric information.\n", + "\n", + "Provides functions to use geometric information to extract information.\n", + "\n", + "Use cases include:\n", + "\n", + " - Give context to key/value pairs from the Amazon Textract AnalyzeDocument API for FORMS\n", + " - Find values in specific areas\n", + " \n", + "### Other helper libraries for Textract response parsing : \n", + "\n", + "- Using call_textract( ) from the [Textract-Caller](https://github.com/aws-samples/amazon-textract-textractor/tree/c689441c0562afb4976d4f248559e59289a33777/caller) library makes it easy to call the AnalyzeDocument API and work with its JSON response.\n", + "\n", + "- The [Textract-PrettyPrinter](https://github.com/aws-samples/amazon-textract-textractor/tree/master/prettyprinter) library provides functions to format the output received from Textract in more easily consumable formats such as CSV.\n", + "\n", + "You will need to run the cell below only once for installation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "738b33d2", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + 
"outputs": [], + "source": [ + "!python -m pip install amazon-textract-helper amazon-textract-geofinder amazon-textract-caller amazon-textract-response-parser --upgrade " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8b6d343", + "metadata": {}, + "outputs": [], + "source": [ + "from textractgeofinder.ocrdb import AreaSelection\n", + "from textractgeofinder.tgeofinder import KeyValue, TGeoFinder, AreaSelection, SelectionElement\n", + "from textractprettyprinter.t_pretty_print import get_forms_string, convert_form_to_list_trp2\n", + "\n", + "from textractcaller.t_call import call_textract\n", + "from textractcaller.t_call import Textract_Features\n", + "\n", + "import trp.trp2 as t2" + ] + }, + { + "cell_type": "markdown", + "id": "ca0940f5", + "metadata": {}, + "source": [ + "This is the image we want to extract information from." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9f73561", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "from IPython.display import Image, display, HTML, JSON, IFrame\n", + "\n", + "documentName = \"doc-samples/Acord-127-pg3.png\"\n", + "display(IFrame(documentName, 500, 600));" + ] + }, + { + "cell_type": "markdown", + "id": "4031e636", + "metadata": {}, + "source": [ + "Calling Amazon Textract with the textractcaller library is easy. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65cb6ead", + "metadata": {}, + "outputs": [], + "source": [ + "j = call_textract(input_document=documentName, features=[Textract_Features.FORMS, Textract_Features.TABLES])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dbb702a", + "metadata": {}, + "outputs": [], + "source": [ + "print(get_forms_string(j))" + ] + }, + { + "cell_type": "markdown", + "id": "23ad2d9c", + "metadata": {}, + "source": [ + "There are multiple blocktypes in JSON. We need to deserialize the JSON into blocks using trp library." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92a472c6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#from textractcaller.t_call import call_textract, Textract_Features\n", + "from trp.trp2 import TDocument, TDocumentSchema\n", + "from trp.t_pipeline import order_blocks_by_geo\n", + "import trp\n", + "\n", + "t_doc = TDocumentSchema().load(j)\n", + "# the ordered_doc has elements ordered by y-coordinate (top to bottom of page)\n", + "ordered_doc = order_blocks_by_geo(t_doc)\n", + "# send to trp for further processing logic\n", + "trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))\n", + "# print(trp_doc)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ec1e7a4", + "metadata": {}, + "outputs": [], + "source": [ + "for page in trp_doc.pages:\n", + " for field in page.form.fields:\n", + " key = field.key.text if field.key else \"\"\n", + " value = field.value.text if field.value else \"\"\n", + " print(key, \":\" ,value)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbdf6285", + "metadata": {}, + "outputs": [], + "source": [ + "t_document = t2.TDocumentSchema().load(j)\n", + "doc_height = 1000\n", + "doc_width = 1000\n", + "geofinder_doc = TGeoFinder(j, doc_height=doc_height, doc_width=doc_width)" + ] + }, + { + "cell_type": "markdown", + "id": "709e4811", + "metadata": {}, + "source": [ + "## Step 2 : Using BoundingBox information given by Textract\n", + "\n", + "The [bounding box](https://docs.aws.amazon.com/textract/latest/dg/API_BoundingBox.html) around the detected page, text, key-value pair, table, table cell, or selection element on a document page. 
\n", + "\n", + "We will now find the `Top` value, which is the top coordinate of the bounding box as a ratio of the overall document page height.\n", + "\n", + "This top value gives the Y coordinate for each `Veh #` key on the ACORD 127 form.\n", + "The X and Y values that are returned are ratios of the overall document page size. For example, if the input document is 700 x 200 and the operation returns X=0.5 and Y=0.25, then the point is at the (350,50) pixel coordinate on the document page.\n", + "Since our document dimensions are 1000x1000 (configured in the cell above), we multiply each `vehicle_top` value by 1000 to get the Y coordinate for each vehicle number." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8119bd3", + "metadata": {}, + "outputs": [], + "source": [ + "vehicle_top_list = []\n", + "for page in trp_doc.pages:\n", + " for field in page.form.fields:\n", + " key = field.key.text if field.key else \"\"\n", + " value = field.value.text if field.value else \"\"\n", + " if (key).lower().startswith('veh'):\n", + " vehicle_top = field.geometry.boundingBox.top\n", + " vehicle_top_list.append((int(vehicle_top*1000)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ab04c43", + "metadata": {}, + "outputs": [], + "source": [ + "vehicle_top_list" + ] + }, + { + "cell_type": "markdown", + "id": "583d77c5", + "metadata": {}, + "source": [ + "Define `set_hierarchy_kv()`, a helper function that adds \"virtual\" keys which we use to indicate context." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48f1429e", + "metadata": {}, + "outputs": [], + "source": [ + "def set_hierarchy_kv(list_kv: list[KeyValue], t_document: t2.TDocument, page_block: t2.TBlock, prefix=\"BORROWER\"):\n", + " for x in list_kv:\n", + " t_document.add_virtual_key_for_existing_key(key_name=f\"{prefix}_{x.key.text}\",\n", + " existing_key=t_document.get_block_by_id(x.key.id),\n", + " page_block=page_block)" + ] + }, + { + "cell_type": "markdown", + "id": "e384b97c", + "metadata": {}, + "source": [ + "Next, let's find the second coordinate, the lower-right corner of each section. We can run the geofinder method `find_phrase_on_page()` to find the 4 places where \"Total prem:\" appears in the document sample. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98738ec3", + "metadata": {}, + "outputs": [], + "source": [ + "geofinder_doc.find_phrase_on_page(\"total prem:\", min_textdistance=0.99)" + ] + }, + { + "cell_type": "markdown", + "id": "599d11fb", + "metadata": {}, + "source": [ + "## Step 3 : Map each Vehicle # to its corresponding key-value pairs\n", + "\n", + "We find the relevant phrases in the document to specify the area of key-value pairs related to each vehicle.\n", + "\n", + "We then use this information to add new key-value pairs to the Amazon Textract Response JSON Schema." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a4511a7", + "metadata": {}, + "outputs": [], + "source": [ + "# Vehicle 1 geo-info\n", + "top_left = t2.TPoint(y=vehicle_top_list[0], x=0)\n", + "total_prem_1 = geofinder_doc.find_phrase_on_page(\"total prem:\", min_textdistance=0.99)[0]\n", + "lower_right = t2.TPoint(y=total_prem_1.ymin, x=doc_width)\n", + "\n", + "# Vehicle 1 hierarchical key\n", + "form_fields = geofinder_doc.get_form_fields_in_area(\n", + " area_selection=AreaSelection(top_left=top_left, lower_right=lower_right, page_number=1))\n", + 
"set_hierarchy_kv(list_kv=form_fields,\n", + " t_document=t_document,\n", + " prefix='Veh_1',\n", + " page_block=t_document.pages[0])\n", + "\n", + "# Vehicle 2 geo-info\n", + "top_left = t2.TPoint(y=vehicle_top_list[1], x=0)\n", + "total_prem_2 = geofinder_doc.find_phrase_on_page(\"total prem:\", min_textdistance=0.99)[1]\n", + "lower_right = t2.TPoint(y=total_prem_2.ymin, x=doc_width)\n", + "\n", + "# Vehicle 2 hierarchical key\n", + "form_fields = geofinder_doc.get_form_fields_in_area(\n", + " area_selection=AreaSelection(top_left=top_left, lower_right=lower_right, page_number=1))\n", + "set_hierarchy_kv(list_kv=form_fields,\n", + " t_document=t_document,\n", + " prefix='Veh_2',\n", + " page_block=t_document.pages[0])\n", + "\n", + "# Vehicle 3 geo-info\n", + "top_left = t2.TPoint(y=vehicle_top_list[2], x=0)\n", + "total_prem_3 = geofinder_doc.find_phrase_on_page(\"total prem:\", min_textdistance=0.99)[2]\n", + "lower_right = t2.TPoint(y=total_prem_3.ymin, x=doc_width)\n", + "\n", + "# Vehicle 3 hierarchical key\n", + "form_fields = geofinder_doc.get_form_fields_in_area(\n", + " area_selection=AreaSelection(top_left=top_left, lower_right=lower_right, page_number=1))\n", + "set_hierarchy_kv(list_kv=form_fields,\n", + " t_document=t_document,\n", + " prefix='Veh_3',\n", + " page_block=t_document.pages[0])\n", + "\n", + "# Vehicle 4 geo-info\n", + "top_left = t2.TPoint(y=vehicle_top_list[3], x=0)\n", + "total_prem_4 = geofinder_doc.find_phrase_on_page(\"total prem:\", min_textdistance=0.99)[3]\n", + "lower_right = t2.TPoint(y=total_prem_4.ymin, x=doc_width)\n", + "\n", + "# Vehicle 4 hierarchical key\n", + "form_fields = geofinder_doc.get_form_fields_in_area(\n", + " area_selection=AreaSelection(top_left=top_left, lower_right=lower_right, page_number=1))\n", + "set_hierarchy_kv(list_kv=form_fields,\n", + " t_document=t_document,\n", + " prefix='Veh_4',\n", + " page_block=t_document.pages[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"b4e2fa2f", + "metadata": {}, + "outputs": [], + "source": [ + "print(get_forms_string(t2.TDocumentSchema().dump(t_document)))" + ] + }, + { + "cell_type": "markdown", + "id": "0cbdfb02", + "metadata": {}, + "source": [ + "## Step 4 : Print forms in Pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50e31685", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from textractprettyprinter.t_pretty_print import convert_form_to_list\n", + "from trp import Document\n", + "\n", + "tdoc=Document(t2.TDocumentSchema().dump(t_document))\n", + "\n", + "dfs = list()\n", + "for page in tdoc.pages:\n", + " dfs.append(pd.DataFrame(convert_form_to_list(trp_form=page.form)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5ac6b5e", + "metadata": {}, + "outputs": [], + "source": [ + "dfs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02e48123", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/post-processing-samples/CMS1500.ipynb b/python/post-processing-samples/CMS1500.ipynb new file mode 100644 index 0000000..3c05f43 --- /dev/null +++ b/python/post-processing-samples/CMS1500.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4578c90d", + "metadata": {}, + "source": [ + "# Post-processing CMS1500 forms using Textract Geofinder Library \n", + "\n", + "\n", + "\n", + "In document processing workflows, often times we need post-processing techniques to extract consistently formatted 
entities and improve the accuracy of the data ingested from documents to downstream systems. **Textractor** is a python package created to seamlessly work with [Amazon Textract](https://docs.aws.amazon.com/textract/latest/dg/what-is.html) a document intelligence service offering text recognition, table extraction, form processing, and much more. Whether you are making a one-off script or a complex distributed document processing pipeline, Textractor makes it easy to use Textract offering various post-processing capabilities.\n", + "\n", + "Below are the different amazon-textract-* packages, you can find them using the links below:\n", + "\n", + "- [amazon-textract-caller](https://github.com/aws-samples/amazon-textract-textractor/tree/master/caller) (to simplify calling Amazon Textract without additional dependencies)\n", + "- [amazon-textract-response-parser](https://pypi.org/project/amazon-textract-response-parser/) (to parse the JSON response returned by Textract APIs)\n", + "- [amazon-textract-overlayer](https://github.com/aws-samples/amazon-textract-textractor/tree/master/overlayer) (to draw bounding boxes around the document entities on the document image)\n", + "- [amazon-textract-prettyprinter](https://github.com/aws-samples/amazon-textract-textractor/tree/master/prettyprinter) (convert Amazon Textract response to CSV, text, markdown, ...)\n", + "- [amazon-textract-geofinder](https://github.com/aws-samples/amazon-textract-textractor/tree/master/tpipelinegeofinder) (extract specific information from document with methods that help navigate the document using geometry and relations, e. g. hierarchical key/value pairs)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "a0c0e70c", + "metadata": {}, + "source": [ + "## Installation\n", + "You will need to run the cell below only once for installation. \n", + "In this use case, Amazon Textract Geofinder is the main package that we will use. 
\n", + "\n", + "\n", + "[Amazon Textract Geofinder](https://pypi.org/project/amazon-textract-geofinder/) : Amazon Textract package for easier access to data through geometric information and for extracting specific entities.\n", + "\n", + "Use cases include:\n", + "\n", + " - Give context to key/value pairs from the Amazon Textract AnalyzeDocument API for FORMS\n", + " - Find values in specific areas\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "163dde71", + "metadata": { + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "!python -m pip install amazon-textract-helper amazon-textract-geofinder" + ] + }, + { + "cell_type": "markdown", + "id": "df58d2ae", + "metadata": {}, + "source": [ + "## Notebook setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5f3a724f", + "metadata": {}, + "outputs": [], + "source": [ + "from textractgeofinder.ocrdb import AreaSelection\n", + "from textractgeofinder.tgeofinder import KeyValue, TGeoFinder, AreaSelection, SelectionElement\n", + "from textractprettyprinter.t_pretty_print import get_forms_string\n", + "from textractcaller import call_textract\n", + "from textractcaller.t_call import Textract_Features\n", + "import trp.trp2 as t2" + ] + }, + { + "cell_type": "markdown", + "id": "b39b7838", + "metadata": {}, + "source": [ + "### Other helper libraries for Textract response parsing : \n", + "\n", + "- Using call_textract( ) from the [Textract-Caller](https://github.com/aws-samples/amazon-textract-textractor/tree/c689441c0562afb4976d4f248559e59289a33777/caller) library makes it easy to call the AnalyzeDocument API and work with its JSON response.\n", + "\n", + "- The [Textract-PrettyPrinter](https://github.com/aws-samples/amazon-textract-textractor/tree/master/prettyprinter) library provides functions to format the output received from Textract in more easily consumable formats such as CSV." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "095f2108", + "metadata": {}, + "outputs": [], + "source": [ + "# path to the image/file \n", + "image_filename='./doc-samples/CMS1500.png'\n", + "\n", + "j = call_textract(input_document=image_filename, features=[Textract_Features.FORMS])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7faf96db", + "metadata": {}, + "outputs": [], + "source": [ + "# loading the response JSON to TDocumentSchema Object.\n", + "t_document = t2.TDocumentSchema().load(j)\n", + "doc_height = 1000\n", + "doc_width = 1000\n", + "# loading the response JSON to TGeoFinder Schema Object.\n", + "geofinder_doc = TGeoFinder(j, doc_height=doc_height, doc_width=doc_width)" + ] + }, + { + "cell_type": "markdown", + "id": "d247c630", + "metadata": {}, + "source": [ + "## Use case 1 : Hierarchical Key-Value mapping\n" + ] + }, + { + "cell_type": "markdown", + "id": "e382f2bd", + "metadata": {}, + "source": [ + "Here, we define `set_hierarchy_kv`, a helper function that adds \"virtual\" hierarchical keys to give context to the leaf key-value pairs." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f5f47242", + "metadata": {}, + "outputs": [], + "source": [ + "def set_hierarchy_kv(list_kv: list[KeyValue], t_document: t2.TDocument, page_block: t2.TBlock, prefix=\"BORROWER\"):\n", + " for x in list_kv:\n", + " t_document.add_virtual_key_for_existing_key(key_name=f\"{prefix}_{x.key.text}\",\n", + " existing_key=t_document.get_block_by_id(x.key.id),\n", + " page_block=page_block)" + ] + }, + { + "cell_type": "markdown", + "id": "4c66ec9d", + "metadata": {}, + "source": [ + "We then find the relevant phrases in the document to specify the area of key value pairs related to the patient information. Further, we will use this information to add new key value pairs with their \"Hierarchical\" parent key to the Amazon Textract Response JSON Schema." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4907b3c8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Using geometrical information in the form for mapping keys to Item #6 Patient Relationship\n", + "\n", + "patient_dob = geofinder_doc.find_phrase_on_page(\"6. PATIENT RELATIONSHIP TO INSURED\")[0]\n", + "patient_relationship = geofinder_doc.find_phrase_on_page(\"8. PATIENT STATUS\",min_textdistance=0.99)[0]\n", + "\n", + "top_left = t2.TPoint(y=patient_dob.ymax, x=0)\n", + "lower_right = t2.TPoint(y=patient_relationship.ymin, x=doc_width)\n", + "\n", + "\n", + "form_fields = geofinder_doc.get_form_fields_in_area(\n", + " area_selection=AreaSelection(top_left=top_left, lower_right=lower_right, page_number=1))\n", + "set_hierarchy_kv(list_kv=form_fields,\n", + " t_document=t_document,\n", + " prefix='6_PT_RELATIONSHIP',\n", + " page_block=t_document.pages[0])\n" + ] + }, + { + "cell_type": "markdown", + "id": "df55c7a9", + "metadata": {}, + "source": [ + "All the keys now have a context which makes it possible to parse the response in downstream processes. \n", + "\n", + "For eg.: for Item #6, the keys now have a hierarchical key `6_PT_RELATIONSHIP` i.e. `6_PT_RELATIONSHIP_spouse` shows `SELECTED` and the rest of the keys show `NOT_SELECTED` in their values." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c4e3ebe0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|----------------------------------------------------------------------------------------------|----------------------------------------------------------|\n", + "| Key | Value |\n", + "| Single | NOT_SELECTED |\n", + "| 7. INSURED'S ADDRESS (No., Street) | 123 Any Street |\n", + "| CITY | Any City |\n", + "| 32. 
NAME AND ADDRESS OF FACILITY WHERE SERVICES WERE RENDERED (If other than home or office) | Mateo Jackson PhD 9876 Healthcare Ave Any Town, CA 92126 |\n", + "| Employed | NOT_SELECTED |\n", + "| TELEPHONE (Include Area Code) | ( 858 ) 555-0100 |\n", + "| Child | NOT_SELECTED |\n", + "| YES | NOT_SELECTED |\n", + "| NO | SELECTED |\n", + "| YES | SELECTED |\n", + "| Other | NOT_SELECTED |\n", + "| C. INSURANCE PLAN NAME OR PROGRAM NAME | Group Insur of Amer. |\n", + "| Part-Time Student | NOT_SELECTED |\n", + "| d. INSURANCE PLAN NAME OR PROGRAM NAME | |\n", + "| ZIP CODE | 92127 |\n", + "| NO | SELECTED |\n", + "| 2. PATIENT'S NAME (Last Name, First Name, Middle Initial) | Doe, John |\n", + "| Spouse | SELECTED |\n", + "| (Medicaid #) | NOT_SELECTED |\n", + "| Married | SELECTED |\n", + "| Full-Time Student | NOT_SELECTED |\n", + "| YES | NOT_SELECTED |\n", + "| (VA File #) | NOT_SELECTED |\n", + "| YES | NOT_SELECTED |\n", + "| YES | NOT_SELECTED |\n", + "| 10d. RESERVED FOR LOCAL USE | |\n", + "| ZIP CODE | 92127 |\n", + "| NO | NOT_SELECTED |\n", + "| Self | NOT_SELECTED |\n", + "| YES | NOT_SELECTED |\n", + "| TELEPHONE (INCLUDE AREA CODE) | ( 858 ) 555-0100 |\n", + "| 11. INSURED'S POLICY GROUP OR FECA NUMBER | G4683A |\n", + "| 5. PATIENT'S ADDRESS (No., Street) | 123 Any Street |\n", + "| (ID) | NOT_SELECTED |\n", + "| 9. OTHER INSURED'S NAME (Last Name, First Name, Middle Initial) | |\n", + "| DATE | 01-15-2021 |\n", + "| CITY | Any City |\n", + "| EIN | NOT_SELECTED |\n", + "| NO | NOT_SELECTED |\n", + "| Other | NOT_SELECTED |\n", + "| 17. NAME OF REFERRING PHYSICIAN OR OTHER SOURCE | Self |\n", + "| SIGNED | MtJackson |\n", + "| 29. AMOUNT PAID | $ |\n", + "| C. EMPLOYER'S NAME OR SCHOOL NAME | |\n", + "| (Medicare #) | SELECTED |\n", + "| 4. INSURED'S NAME (Last Name, First Name, Middle Initial) | Doe, Jane |\n", + "| 17a. I.D. NUMBER OF REFERRING PHYSICIAN | |\n", + "| MM | 10 |\n", + "| a. 
OTHER INSURED'S POLICY OR GROUP NUMBER | |\n", + "| DATE | 10/11/21 |\n", + "| FECA BLK LUNG (SSN) | NOT_SELECTED |\n", + "| M | SELECTED |\n", + "| b. EMPLOYER'S NAME OR SCHOOL NAME | |\n", + "| 23. PRIOR AUTHORIZATION NUMBER | |\n", + "| MM | 06 |\n", + "| SIGNED | JaneDoe |\n", + "| STATE | CA |\n", + "| GRP# | |\n", + "| YY | 21 |\n", + "| CODE | |\n", + "| M | NOT_SELECTED |\n", + "| NO | SELECTED |\n", + "| F | SELECTED |\n", + "| NO | SELECTED |\n", + "| 19. RESERVED FOR LOCAL USE | |\n", + "| DD | 11 |\n", + "| SIGNED | Jdoe |\n", + "| SSN | SELECTED |\n", + "| DD | |\n", + "| DD | 12 |\n", + "| 3. | R19 7 |\n", + "| 30. BALANCE DUE | $ |\n", + "| 26. PATIENT'S ACCOUNT NO. | |\n", + "| 4. | K92 1 |\n", + "| ORIGINAL REF. NO. | |\n", + "| STATE | CA |\n", + "| SEX | F |\n", + "| M | NOT_SELECTED |\n", + "| 1a. INSURED'S I.D. NUMBER | 11-2234-10190 |\n", + "| DD | |\n", + "| MM | 10 |\n", + "| GROUP HEALTH PLAN (SSN or ID) | NOT_SELECTED |\n", + "| DD | |\n", + "| MM | |\n", + "| 25. FEDERAL TAX I.D. NUMBER | 555-88-9999 |\n", + "| DD | |\n", + "| YY | |\n", + "| DD | |\n", + "| DD | |\n", + "| YY | |\n", + "| PICA | NOT_SELECTED |\n", + "| 10. IS PATIENT'S CONDITION RELATED TO: | |\n", + "| (Sponsor's SSN) | NOT_SELECTED |\n", + "| F | NOT_SELECTED |\n", + "| PIN# | Any Town, CA 92126 |\n", + "| TO | |\n", + "| PICA | NOT_SELECTED |\n", + "| 1. | R11 0 |\n", + "| MM | |\n", + "| MM | |\n", + "| 2. | K59 00 |\n", + "| YY | |\n", + "| MM | |\n", + "| (FOR PROGRAM IN ITEM 1) | |\n", + "| 28. TOTAL CHARGE | $ 405.00 |\n", + "| 24. 
| A |\n", + "| 6_PT_RELATIONSHIP_self | NOT_SELECTED |\n", + "| 6_PT_RELATIONSHIP_spouse | SELECTED |\n", + "| 6_PT_RELATIONSHIP_child | NOT_SELECTED |\n", + "| 6_PT_RELATIONSHIP_other | NOT_SELECTED |\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(get_forms_string(t2.TDocumentSchema().dump(t_document)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dbc7c7c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/post-processing-samples/doc-samples/Acord-127-pg3-annotated.png b/python/post-processing-samples/doc-samples/Acord-127-pg3-annotated.png new file mode 100644 index 0000000..47c7cf2 Binary files /dev/null and b/python/post-processing-samples/doc-samples/Acord-127-pg3-annotated.png differ diff --git a/python/post-processing-samples/doc-samples/Acord-127-pg3.png b/python/post-processing-samples/doc-samples/Acord-127-pg3.png new file mode 100644 index 0000000..7c9784d Binary files /dev/null and b/python/post-processing-samples/doc-samples/Acord-127-pg3.png differ diff --git a/python/post-processing-samples/doc-samples/CMS1500.png b/python/post-processing-samples/doc-samples/CMS1500.png new file mode 100644 index 0000000..1caea71 Binary files /dev/null and b/python/post-processing-samples/doc-samples/CMS1500.png differ