Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: [contentwarehouse] Make Layout Parser generally available in V1 #5394

Merged
merged 6 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -893,6 +893,160 @@ message Document {
repeated Provenance provenance = 3 [deprecated = true];
}

// The parsed layout of a document: the collection of blocks that the document
// has been divided into.
message DocumentLayout {
  // One layout block. A block holds one of the supported content kinds
  // (text, table or list).
  message DocumentLayoutBlock {
    // The page range that a block covers within the document.
    message LayoutPageSpan {
      // Page on which the block begins.
      int32 page_start = 1;

      // Page on which the block ends.
      int32 page_end = 2;
    }

    // A block whose content is text.
    message LayoutTextBlock {
      // The text held by the block.
      string text = 1;

      // Kind of text in the block. Available options are: `paragraph`,
      // `subtitle`, `heading-1`, `heading-2`, `heading-3`, `heading-4`,
      // `heading-5`, `header`, `footer`.
      string type = 2;

      // Child blocks of this text block. Because each child is itself a
      // DocumentLayoutBlock, blocks can nest to form deeper hierarchies.
      repeated DocumentLayoutBlock blocks = 3;
    }

    // A block whose content is a table.
    message LayoutTableBlock {
      // Rows that form the header at the top of the table.
      repeated LayoutTableRow header_rows = 1;

      // Rows that hold the main content of the table.
      repeated LayoutTableRow body_rows = 2;

      // Caption (title) of the table.
      string caption = 3;
    }

    // One row of a table.
    message LayoutTableRow {
      // The cells that make up the row.
      repeated LayoutTableCell cells = 1;
    }

    // One cell within a table row.
    message LayoutTableCell {
      // The blocks contained in the cell. Because each entry is itself a
      // DocumentLayoutBlock, cell content can nest to form deeper hierarchies.
      repeated DocumentLayoutBlock blocks = 1;

      // Number of rows spanned by this cell.
      int32 row_span = 2;

      // Number of columns spanned by this cell.
      int32 col_span = 3;
    }

    // A block whose content is a list.
    message LayoutListBlock {
      // The entries that make up the list.
      repeated LayoutListEntry list_entries = 1;

      // Type of the list_entries (if any exist). Available options are
      // `ordered` and `unordered`.
      string type = 2;
    }

    // One entry of a list.
    message LayoutListEntry {
      // The blocks contained in the entry. Because each entry is itself a
      // DocumentLayoutBlock, list content can nest to form deeper hierarchies.
      repeated DocumentLayoutBlock blocks = 1;
    }

    // The content carried by this block; the three kinds are mutually
    // exclusive.
    oneof block {
      // Text content.
      LayoutTextBlock text_block = 2;

      // Table content and structure.
      LayoutTableBlock table_block = 3;

      // List content and structure.
      LayoutListBlock list_block = 4;
    }

    // Identifier of the block.
    string block_id = 1;

    // Pages covered by the block.
    LayoutPageSpan page_span = 5;
  }

  // The blocks that make up the document.
  repeated DocumentLayoutBlock blocks = 1;
}

// The chunks that a document has been divided into.
message ChunkedDocument {
  // A single chunk of the document.
  message Chunk {
    // The page range that a chunk covers within the document.
    message ChunkPageSpan {
      // Page on which the chunk begins.
      int32 page_start = 1;

      // Page on which the chunk ends.
      int32 page_end = 2;
    }

    // A page header associated with the chunk.
    message ChunkPageHeader {
      // The header as text.
      string text = 1;

      // Pages covered by the header.
      ChunkPageSpan page_span = 2;
    }

    // A page footer associated with the chunk.
    message ChunkPageFooter {
      // The footer as text.
      string text = 1;

      // Pages covered by the footer.
      ChunkPageSpan page_span = 2;
    }

    // Identifier of the chunk.
    string chunk_id = 1;

    // This field is unused.
    repeated string source_block_ids = 2;

    // The text held by the chunk.
    string content = 3;

    // Pages covered by the chunk.
    ChunkPageSpan page_span = 4;

    // The page headers associated with the chunk.
    repeated ChunkPageHeader page_headers = 5;

    // The page footers associated with the chunk.
    repeated ChunkPageFooter page_footers = 6;
  }

  // The chunks that make up the document.
  repeated Chunk chunks = 1;
}

// Original source document from the user.
oneof source {
// Optional. Currently supports Google Cloud Storage URI of the form
Expand Down Expand Up @@ -944,4 +1098,10 @@ message Document {

// Placeholder. Revision history of this document.
repeated Revision revisions = 13;

// Parsed layout of the document.
DocumentLayout document_layout = 17;

// Document chunked based on chunking config.
ChunkedDocument chunked_document = 18;
}
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,23 @@ service DocumentProcessorService {

// Options for Process API
message ProcessOptions {
// Serving configuration for the layout parser processor.
message LayoutConfig {
  // Serving configuration that controls how documents are chunked.
  message ChunkingConfig {
    // Optional. The chunk size to use when splitting documents. This is a
    // single value, not a per-level list of sizes.
    int32 chunk_size = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether or not to include ancestor headings when splitting.
    bool include_ancestor_headings = 2
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. Chunking configuration for the layout parser processor.
  ChunkingConfig chunking_config = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A list of individual page numbers.
message IndividualPageSelector {
// Optional. Indices of the pages (starting from 1).
Expand Down Expand Up @@ -356,6 +373,10 @@ message ProcessOptions {
// Returns error if set on other processor types.
OcrConfig ocr_config = 1;

// Optional. Only applicable to `LAYOUT_PARSER_PROCESSOR`.
// Returns error if set on other processor types.
LayoutConfig layout_config = 9 [(google.api.field_behavior) = OPTIONAL];

// Optional. Override the schema of the
// [ProcessorVersion][google.cloud.documentai.v1.ProcessorVersion]. Will
// return an Invalid Argument error if this field is set when the underlying
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ message ProcessorVersion {

// Output only. The model type of this processor version.
ModelType model_type = 15 [(google.api.field_behavior) = OUTPUT_ONLY];

// Output only. Reserved for future use.
bool satisfies_pzs = 16 [(google.api.field_behavior) = OUTPUT_ONLY];

// Output only. Reserved for future use.
bool satisfies_pzi = 17 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Contains the alias and the aliased resource name of processor version.
Expand Down Expand Up @@ -224,4 +230,10 @@ message Processor {
// The [KMS key](https://cloud.google.com/security-key-management) used for
// encryption and decryption in CMEK scenarios.
string kms_key_name = 8;

// Output only. Reserved for future use.
bool satisfies_pzs = 12 [(google.api.field_behavior) = OUTPUT_ONLY];

// Output only. Reserved for future use.
bool satisfies_pzi = 13 [(google.api.field_behavior) = OUTPUT_ONLY];
}
Loading
Loading