Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
__version__ = "3.4.0" # {x-release-please-version}
__version__ = "0.0.0" # {x-release-please-version}
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
__version__ = "3.4.0" # {x-release-please-version}
__version__ = "0.0.0" # {x-release-please-version}
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
__version__ = "3.4.0" # {x-release-please-version}
__version__ = "0.0.0" # {x-release-please-version}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ class Document(proto.Message):
representations use base64.

This field is a member of `oneof`_ ``source``.
docid (str):
Optional. An internal identifier for the
document. Should be loggable (no PII).
mime_type (str):
An IANA published `media type (MIME
type) <https://www.iana.org/assignments/media-types/media-types.xhtml>`__.
Expand Down Expand Up @@ -108,6 +111,12 @@ class Document(proto.Message):
Parsed layout of the document.
chunked_document (google.cloud.documentai_v1beta3.types.Document.ChunkedDocument):
Document chunked based on chunking config.
blob_assets (MutableSequence[google.cloud.documentai_v1beta3.types.Document.BlobAsset]):
Optional. The blob assets in this document.
This is used to store the content of the inline
blobs in this document, e.g. image bytes, such
that it can be referenced by other fields in the
document via asset id.
"""

class ShardInfo(proto.Message):
Expand Down Expand Up @@ -1819,6 +1828,20 @@ class TextChange(proto.Message):
message="Document.Provenance",
)

class Annotations(proto.Message):
r"""Represents the annotation of a block or a chunk.

Attributes:
description (str):
The description of the content with this
annotation.
"""

# Plain string field carried as proto field number 1.
description: str = proto.Field(
proto.STRING,
number=1,
)

class DocumentLayout(proto.Message):
r"""Represents the parsed layout of a document as a collection of
blocks that the document is divided into.
Expand Down Expand Up @@ -1851,11 +1874,17 @@ class DocumentLayoutBlock(proto.Message):
list_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock):
Block consisting of list content/structure.

This field is a member of `oneof`_ ``block``.
image_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutImageBlock):
Block consisting of image content.

This field is a member of `oneof`_ ``block``.
block_id (str):
ID of the block.
page_span (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan):
Page span of the block.
bounding_box (google.cloud.documentai_v1beta3.types.BoundingPoly):
Identifies the bounding box for the block.
"""

class LayoutPageSpan(proto.Message):
Expand Down Expand Up @@ -2028,6 +2057,74 @@ class LayoutListEntry(proto.Message):
message="Document.DocumentLayout.DocumentLayoutBlock",
)

class LayoutImageBlock(proto.Message):
r"""Represents an image type block.

This message has `oneof`_ fields (mutually exclusive fields).
For each oneof, at most one member field can be set at the same time.
Setting any member of the oneof automatically clears all other
members.

.. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields

Attributes:
blob_asset_id (str):
Optional. Asset id of the inline image. If set, find the
image content in the blob_assets field.

This field is a member of `oneof`_ ``image_source``.
gcs_uri (str):
Optional. Google Cloud Storage uri of the
image.

This field is a member of `oneof`_ ``image_source``.
data_uri (str):
Optional. Data uri of the image. It is composed of four
parts: a prefix (data:), a MIME type indicating the type of
data, an optional base64 token if non-textual, and the data
itself: ``data:[<mediatype>][;base64],<data>``

This field is a member of `oneof`_ ``image_source``.
mime_type (str):
Mime type of the image. An IANA published `media type (MIME
type) <https://www.iana.org/assignments/media-types/media-types.xhtml>`__.
image_text (str):
Text extracted from the image using OCR or
alt text describing the image.
annotations (google.cloud.documentai_v1beta3.types.Document.Annotations):
Annotation of the image block.
"""

# Image source: the next three fields all belong to the ``image_source``
# oneof (field numbers 4-6), so setting one clears the other two.
blob_asset_id: str = proto.Field(
proto.STRING,
number=4,
oneof="image_source",
)
gcs_uri: str = proto.Field(
proto.STRING,
number=5,
oneof="image_source",
)
data_uri: str = proto.Field(
proto.STRING,
number=6,
oneof="image_source",
)
# Regular (non-oneof) fields, numbered 1-3.
mime_type: str = proto.Field(
proto.STRING,
number=1,
)
image_text: str = proto.Field(
proto.STRING,
number=2,
)
annotations: "Document.Annotations" = proto.Field(
proto.MESSAGE,
number=3,
message="Document.Annotations",
)

text_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutTextBlock" = proto.Field(
proto.MESSAGE,
number=2,
Expand All @@ -2046,6 +2143,12 @@ class LayoutListEntry(proto.Message):
oneof="block",
message="Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock",
)
image_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutImageBlock" = proto.Field(
proto.MESSAGE,
number=7,
oneof="block",
message="Document.DocumentLayout.DocumentLayoutBlock.LayoutImageBlock",
)
block_id: str = proto.Field(
proto.STRING,
number=1,
Expand All @@ -2055,6 +2158,11 @@ class LayoutListEntry(proto.Message):
number=5,
message="Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan",
)
bounding_box: geometry.BoundingPoly = proto.Field(
proto.MESSAGE,
number=6,
message=geometry.BoundingPoly,
)

blocks: MutableSequence[
"Document.DocumentLayout.DocumentLayoutBlock"
Expand Down Expand Up @@ -2088,6 +2196,8 @@ class Chunk(proto.Message):
Page headers associated with the chunk.
page_footers (MutableSequence[google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk.ChunkPageFooter]):
Page footers associated with the chunk.
chunk_fields (MutableSequence[google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk.ChunkField]):
Chunk fields inside this chunk.
"""

class ChunkPageSpan(proto.Message):
Expand Down Expand Up @@ -2149,6 +2259,112 @@ class ChunkPageFooter(proto.Message):
message="Document.ChunkedDocument.Chunk.ChunkPageSpan",
)

class ImageChunkField(proto.Message):
r"""The image chunk field in the chunk.

This message has `oneof`_ fields (mutually exclusive fields).
For each oneof, at most one member field can be set at the same time.
Setting any member of the oneof automatically clears all other
members.

.. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields

Attributes:
blob_asset_id (str):
Optional. Asset id of the inline image. If set, find the
image content in the blob_assets field.

This field is a member of `oneof`_ ``image_source``.
gcs_uri (str):
Optional. Google Cloud Storage uri of the
image.

This field is a member of `oneof`_ ``image_source``.
data_uri (str):
Optional. Data uri of the image. It is composed of four
parts: a prefix (data:), a MIME type indicating the type of
data, an optional base64 token if non-textual, and the data
itself: ``data:[<mediatype>][;base64],<data>``

This field is a member of `oneof`_ ``image_source``.
annotations (google.cloud.documentai_v1beta3.types.Document.Annotations):
Annotation of the image chunk field.
"""

# Image source: the next three fields all belong to the ``image_source``
# oneof (field numbers 1-3), so setting one clears the other two.
blob_asset_id: str = proto.Field(
proto.STRING,
number=1,
oneof="image_source",
)
gcs_uri: str = proto.Field(
proto.STRING,
number=2,
oneof="image_source",
)
data_uri: str = proto.Field(
proto.STRING,
number=3,
oneof="image_source",
)
# Not part of the oneof; may be set alongside any image source.
annotations: "Document.Annotations" = proto.Field(
proto.MESSAGE,
number=4,
message="Document.Annotations",
)

class TableChunkField(proto.Message):
r"""The table chunk field in the chunk.

Attributes:
annotations (google.cloud.documentai_v1beta3.types.Document.Annotations):
Annotation of the table chunk field.
"""

# Message-typed field; the target type is given by its string path
# ("Document.Annotations").
annotations: "Document.Annotations" = proto.Field(
proto.MESSAGE,
number=1,
message="Document.Annotations",
)

class ChunkField(proto.Message):
r"""The chunk field in the chunk. A chunk field could be one of
the various types (e.g. image, table) supported.

This message has `oneof`_ fields (mutually exclusive fields).
For each oneof, at most one member field can be set at the same time.
Setting any member of the oneof automatically clears all other
members.

.. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields

Attributes:
image_chunk_field (google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk.ImageChunkField):
The image chunk field in the chunk.

This field is a member of `oneof`_ ``field_type``.
table_chunk_field (google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk.TableChunkField):
The table chunk field in the chunk.

This field is a member of `oneof`_ ``field_type``.
"""

# Both fields are members of the ``field_type`` oneof, so a ChunkField
# holds at most one of an image or a table payload.
image_chunk_field: "Document.ChunkedDocument.Chunk.ImageChunkField" = (
proto.Field(
proto.MESSAGE,
number=1,
oneof="field_type",
message="Document.ChunkedDocument.Chunk.ImageChunkField",
)
)
table_chunk_field: "Document.ChunkedDocument.Chunk.TableChunkField" = (
proto.Field(
proto.MESSAGE,
number=2,
oneof="field_type",
message="Document.ChunkedDocument.Chunk.TableChunkField",
)
)

chunk_id: str = proto.Field(
proto.STRING,
number=1,
Expand Down Expand Up @@ -2180,13 +2396,50 @@ class ChunkPageFooter(proto.Message):
number=6,
message="Document.ChunkedDocument.Chunk.ChunkPageFooter",
)
chunk_fields: MutableSequence[
"Document.ChunkedDocument.Chunk.ChunkField"
] = proto.RepeatedField(
proto.MESSAGE,
number=7,
message="Document.ChunkedDocument.Chunk.ChunkField",
)

chunks: MutableSequence["Document.ChunkedDocument.Chunk"] = proto.RepeatedField(
proto.MESSAGE,
number=1,
message="Document.ChunkedDocument.Chunk",
)

class BlobAsset(proto.Message):
r"""Represents a blob asset. It's used to store the content of
the inline blob in this document, e.g. image bytes, such that it
can be referenced by other fields in the document via asset id.

Attributes:
asset_id (str):
Optional. The id of the blob asset.
content (bytes):
Optional. The content of the blob asset, e.g.
image bytes.
mime_type (str):
The mime type of the blob asset. An IANA published `media
type (MIME
type) <https://www.iana.org/assignments/media-types/media-types.xhtml>`__.
"""

# Identifier that ``blob_asset_id`` fields elsewhere in the document
# (e.g. on image blocks and image chunk fields) refer to.
asset_id: str = proto.Field(
proto.STRING,
number=1,
)
# Raw payload, declared as a BYTES field.
content: bytes = proto.Field(
proto.BYTES,
number=2,
)
mime_type: str = proto.Field(
proto.STRING,
number=3,
)

uri: str = proto.Field(
proto.STRING,
number=1,
Expand All @@ -2197,6 +2450,10 @@ class ChunkPageFooter(proto.Message):
number=2,
oneof="source",
)
docid: str = proto.Field(
proto.STRING,
number=15,
)
mime_type: str = proto.Field(
proto.STRING,
number=3,
Expand Down Expand Up @@ -2255,6 +2512,11 @@ class ChunkPageFooter(proto.Message):
number=18,
message=ChunkedDocument,
)
blob_assets: MutableSequence[BlobAsset] = proto.RepeatedField(
proto.MESSAGE,
number=19,
message=BlobAsset,
)


class RevisionRef(proto.Message):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ class LayoutConfig(proto.Message):
return_bounding_boxes (bool):
Optional. Whether to include bounding boxes
in layout parser processor response.
enable_image_annotation (bool):
Optional. Whether to include image
annotations in layout parser response.
enable_llm_layout_parsing (bool):
Optional. Whether to refine PDF layout using
LLM.
Expand Down Expand Up @@ -201,6 +204,10 @@ class ChunkingConfig(proto.Message):
proto.BOOL,
number=3,
)
enable_image_annotation: bool = proto.Field(
proto.BOOL,
number=4,
)
enable_llm_layout_parsing: bool = proto.Field(
proto.BOOL,
number=5,
Expand Down
Loading