This repository was archived by the owner on Sep 20, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 30
docs(samples): new Doc AI samples for v1beta3 #44
Merged
Merged
Changes from all commits
Commits
Show all changes
49 commits
Select commit
Hold shift + click to select a range
b2dc573
batch_process_sample. changing from async to synchronous
aribray b01d802
add quick start and process_document samples and tests
aribray cfb964a
add test and sample for batch_process
aribray 8f9246d
add test and sample for batch_process
aribray ba7681a
resolve merge conflict
aribray a37f39a
python document ai samples
aribray 99f7f11
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 87254c7
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray bcf97a6
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 26b9450
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 4943437
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 9439937
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 15dd4e4
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 0943fba
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 01058fe
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray d616c54
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 0b18336
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 4d08bf4
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 6ee7994
Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray dc24b32
resolve formatting
aribray 82e8ab9
use os.environ
aribray c373ac3
remove os.path.join
aribray 6389213
move tests
aribray 37cd427
descriptive variable
aribray aef335e
specific Exception, formatting
aribray a4d2b4a
parse all pages in process_document
aribray bbc187e
add more helpful comments
aribray dd6488f
remove unused imports
aribray 2179581
better exception handling
aribray 3cb2c0a
rename test files
aribray f424aee
Merge branch 'master' into python-docai
aribray 27b63f1
Merge branch 'master' into python-docai
aribray 043b445
ran linter, removed nested function in batch predict
aribray dba5ef8
refactor tests
aribray 5416bbc
format imports
aribray d9e2cca
format imports
aribray 700ab75
format imports
aribray 66dde36
serialize as Document object
aribray 0b839e8
extract get_text helper function
aribray bda06e8
fix file path
aribray ad5ff58
delete test bucket
aribray cd4a1d1
Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray e9ba609
Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray a439e32
add more specific assertion in batch_process
aribray 4e3f369
add more specific assertion in process_document and quickstart
aribray 9c7adaf
fix output_uri name
aribray 0849731
Apply suggestions from code review to resolve exception
aribray 61f0c7f
resolve exception
aribray 80d0fb4
lint
aribray File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Empty file.
Empty file.
121 changes: 121 additions & 0 deletions
121
samples/snippets/batch_process_documents_sample_v1beta3.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,121 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
|
|
||
| # [START documentai_batch_process_document] | ||
| import re | ||
|
|
||
| from google.cloud import documentai_v1beta3 as documentai | ||
| from google.cloud import storage | ||
|
|
||
| # TODO(developer): Uncomment these variables before running the sample. | ||
| # project_id = 'YOUR_PROJECT_ID' | ||
| # location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu' | ||
| # processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console | ||
| # gcs_input_uri = "YOUR_INPUT_URI" | ||
| # gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI" | ||
| # gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" | ||
|
|
||
|
|
||
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process a document stored in GCS and print the extracted text.

    Submits a batch request to a Document AI processor, waits for the
    operation to finish, then downloads every blob found under the output
    prefix and prints its form fields and paragraphs.

    Args:
        project_id: Project used to build the processor resource name.
        location: Processor location; 'us' or 'eu'.
        processor_id: ID of the processor (created in the Cloud Console).
        gcs_input_uri: URI of the input document, sent as 'application/pdf'.
        gcs_output_uri: Output bucket URI, e.g. "gs://my-bucket".
        gcs_output_uri_prefix: Prefix under the bucket where results go.

    Raises:
        ValueError: If the combined output location does not look like
            "gs://<bucket>/<prefix>".
    """
    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # Results are written to GCS, so the destination must split into a bucket
    # and an object prefix. Validate this before starting the batch operation
    # rather than failing on a `None` match once it has already completed.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            "Expected an output location like gs://<bucket>/<prefix>/, "
            f"got {destination_uri!r}"
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentProcessorServiceClient()

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result()

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for file_number, blob in enumerate(blob_list, start=1):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {file_number}")

        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Walk the paragraphs of the current page; iterating
            # `document.pages` here would print whole-page text instead.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
|
|
||
|
|
||
| # Extract shards from the text field | ||
def get_text(doc_element: dict, document: dict):
    """Return the document text that a layout element points at.

    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
        document: Object whose ``text`` attribute the offsets index into.

    Returns:
        The concatenation of the text covered by every segment, in order
        (an empty string when there are no segments).
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    #
    # The start offset is read from the segment itself. The previous presence
    # check looked at the anchor's instance dict rather than the segment, so
    # it did not detect a set start_index; with the check false every slice
    # began at offset 0.
    # NOTE(review): assumes an unset start_index reads back as 0 on the real
    # message type (proto3 scalar default) -- confirm.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
|
|
||
|
|
||
| # [END documentai_batch_process_document] | ||
62 changes: 62 additions & 0 deletions
62
samples/snippets/batch_process_documents_sample_v1beta3_test.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
|
|
||
| import os | ||
| from uuid import uuid4 | ||
|
|
||
| from google.cloud import storage | ||
| from google.cloud.exceptions import NotFound | ||
|
|
||
| import pytest | ||
|
|
||
| from samples.snippets import batch_process_documents_sample_v1beta3 | ||
|
|
||
# Fixed inputs for the batch-processing sample test.
location = "us"
# Read from the environment; a missing variable raises KeyError at import time.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
# Random per-run values. The prefix is converted to `str` so the sample
# receives a plain string rather than a `UUID` object.
gcs_output_uri_prefix = str(uuid4())
BUCKET_NAME = f"document-ai-python-{uuid4()}"
|
|
||
aribray marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
@pytest.fixture(scope="module")
def test_bucket():
    """Create a bucket for the module's tests and remove it afterwards.

    Yields the bucket name. On teardown every blob in the bucket is deleted,
    then the bucket itself; a NotFound error during cleanup is reported with
    a printed message instead of being raised.
    """
    created = storage.Client().create_bucket(BUCKET_NAME)
    yield created.name

    try:
        # Take a snapshot of the listing before deleting anything.
        leftovers = [*created.list_blobs()]
        for leftover in leftovers:
            leftover.delete()
        created.delete()
    except NotFound:
        print("Bucket already deleted.")
|
|
||
|
|
||
def test_batch_process_documents(capsys, test_bucket):
    """Run the batch sample against the test bucket and check its output."""
    sample_args = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_input_uri": gcs_input_uri,
        "gcs_output_uri": f"gs://{test_bucket}",
        "gcs_output_uri_prefix": gcs_output_uri_prefix,
    }
    batch_process_documents_sample_v1beta3.batch_process_documents(**sample_args)

    printed = capsys.readouterr().out

    # Each marker must appear somewhere in what the sample printed.
    assert "Extracted" in printed
    assert "Paragraph" in printed
    assert "Invoice" in printed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
|
|
||
| from google.cloud import documentai_v1beta3 as documentai | ||
|
|
||
| # [START documentai_process_document] | ||
|
|
||
| # TODO(developer): Uncomment these variables before running the sample. | ||
| # project_id = 'YOUR_PROJECT_ID' | ||
| # location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu' | ||
| # processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console | ||
| # file_path = '/path/to/local/pdf' | ||
|
|
||
|
|
||
def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    """Process a local PDF with a Document AI processor and print its paragraphs.

    Args:
        project_id: Project used to build the processor resource name.
        location: Processor location; 'us' or 'eu'.
        processor_id: ID of the processor (created in the Cloud Console).
        file_path: Path of the local file to send; it is submitted with the
            'application/pdf' MIME type.
    """
    # Instantiates a client
    client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    # Configure the process request
    request = {
        "name": name,
        "document": {"content": raw_bytes, "mime_type": "application/pdf"},
    }

    # Recognizes text entities in the PDF document
    document = client.process_document(request=request).document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, document)
            print(f"Paragraph text: {paragraph_text}")
|
|
||
|
|
||
| # Extract shards from the text field | ||
def get_text(doc_element: dict, document: dict):
    """Return the document text that a layout element points at.

    Document AI identifies form fields by their offsets
    in document text. This function converts offsets
    to text snippets.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
        document: Object whose ``text`` attribute the offsets index into.

    Returns:
        The concatenation of the text covered by every segment, in order
        (an empty string when there are no segments).
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    #
    # The start offset is read from the segment itself. The previous check
    # tested whether the integer offset was a member of the list of segments,
    # which is effectively never true, so every slice began at offset 0.
    # NOTE(review): assumes an unset start_index reads back as 0 on the real
    # message type (proto3 scalar default) -- confirm.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
|
|
||
|
|
||
| # [END documentai_process_document] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
|
|
||
| import os | ||
|
|
||
| from samples.snippets import process_document_sample_v1beta3 | ||
|
|
||
|
|
||
# Fixed inputs for the process-document sample test.
location = "us"
# Read from the environment; a missing variable raises KeyError at import time.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
# Relative path, so it is resolved against the current working directory.
file_path = "resources/invoice.pdf"
|
|
||
|
|
||
def test_process_documents(capsys):
    """Run the process-document sample and check what it printed."""
    sample_args = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "file_path": file_path,
    }
    process_document_sample_v1beta3.process_document_sample(**sample_args)

    printed = capsys.readouterr().out

    # Each marker must appear somewhere in what the sample printed.
    assert "Paragraph" in printed
    assert "Invoice" in printed
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.