From 1fb464235a2d4fb399244db4ff2ee7a34202b2d6 Mon Sep 17 00:00:00 2001
From: Amanda Cameron <amanda@unstructured.io>
Date: Tue, 3 Oct 2023 09:40:34 -0700
Subject: [PATCH] chore: Table chunking (#1540)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change extends our `add_chunking_strategy` logic so that we
are able to chunk Table elements' `text` and `text_as_html` params. In
order to keep the functionality under the same `by_title` chunking
strategy, we have renamed `combine_under_n_chars` to
`combine_text_under_n_chars` and added a new `max_characters` parameter.
The former functions the same way as before for combining elements under
Titles; the latter specifies the chunk size (in chars) for TableChunk elements.

*The `max_characters` name also reflects the 'hard max' we will implement
for large elements in followup PRs.


Additionally -> some lint changes snuck in when I ran `make tidy` hence
the minor changes in unrelated files :)

TODO:
✅ add unit tests
--> note: added where I could to unit tests! Some unit tests I just
clarified that the chunking strategy was now 'by_title' because we don't
have a file example that has Table elements to test the
'by_num_characters' chunking strategy
✅  update changelog

To manually test:
```
In [1]: filename="example-docs/example-10k.html"

In [2]: from unstructured.chunking.title import chunk_table_element

In [3]: from unstructured.partition.auto import partition

In [4]: elements = partition(filename)

# element at -2 happens to be a Table, and we'll get chunks of char size 4 here
In [5]: chunks = chunk_table_element(elements[-2], 4)

# examine text and text_as_html params
In [6]: for c in chunks:
                    print(c.text)
                    print(c.metadata.text_as_html)
```

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
---
 CHANGELOG.md                                  |   4 +-
 test_unstructured/chunking/test_title.py      |  51 +++++--
 test_unstructured/partition/csv/test_csv.py   |  16 ++
 test_unstructured/partition/docx/test_docx.py |  43 +++++-
 test_unstructured/partition/epub/test_epub.py |  21 +++
 .../partition/markdown/test_md.py             |   2 +-
 test_unstructured/partition/msg/test_msg.py   |   2 +-
 test_unstructured/partition/odt/test_odt.py   |  23 ++-
 .../partition/pdf-image/test_image.py         |  19 +++
 .../partition/pdf-image/test_pdf.py           |   2 +-
 test_unstructured/partition/pptx/test_ppt.py  |   2 +-
 test_unstructured/partition/pptx/test_pptx.py |   4 +-
 .../partition/pypandoc/test_org.py            |   2 +-
 test_unstructured/partition/test_auto.py      |  87 ++++++++++-
 unstructured/__version__.py                   |   2 +-
 unstructured/chunking/title.py                | 142 +++++++++++-------
 unstructured/documents/elements.py            |  12 ++
 unstructured/ingest/interfaces.py             |   8 +-
 unstructured/partition/csv.py                 |   2 +
 unstructured/partition/xlsx.py                |   2 +
 unstructured/staging/weaviate.py              |   1 +
 21 files changed, 356 insertions(+), 91 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index febb223d00..0d06dc15f9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.19-dev6
+## 0.10.19-dev7
 
 ### Enhancements
 
@@ -6,6 +6,8 @@
 * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title.
 * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself.
 * * **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
+* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=<n>` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length <n> characters. This means partitioned Table results are ready for use in downstream applications without any post-processing.
+
 
 ### Features 
 
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py
index 8ccfde5af8..bc8bdcc6b0 100644
--- a/test_unstructured/chunking/test_title.py
+++ b/test_unstructured/chunking/test_title.py
@@ -31,7 +31,7 @@ def test_split_elements_by_title_and_table():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    sections = _split_elements_by_title_and_table(elements, combine_under_n_chars=0)
+    sections = _split_elements_by_title_and_table(elements, combine_text_under_n_chars=0)
 
     assert sections == [
         [
@@ -75,7 +75,7 @@ def test_chunk_by_title():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -112,7 +112,7 @@ def test_chunk_by_title_respects_section_change():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -147,7 +147,7 @@ def test_chunk_by_title_separates_by_page_number():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, multipage_sections=False, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, multipage_sections=False, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -182,7 +182,7 @@ def test_chunk_by_title_groups_across_pages():
         Text("It is storming outside."),
         CheckBox(),
     ]
-    chunks = chunk_by_title(elements, multipage_sections=True, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)
 
     assert chunks == [
         CompositeElement(
@@ -212,24 +212,32 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage():
         filename,
         chunking_strategy="by_title",
         multipage_sections=False,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     partitioned_elements_multipage_true_combine_chars_0 = partition_html(
         filename,
         chunking_strategy="by_title",
         multipage_sections=True,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     elements = partition_html(filename)
     cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title(
         elements,
         multipage_sections=False,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title(
         elements,
         multipage_sections=True,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     assert (
         partitioned_elements_multipage_false_combine_chars_0
@@ -244,7 +252,21 @@ def test_add_chunking_strategy_on_partition_html_respects_multipage():
     )
 
 
-def test_add_chunking_strategy_raises_error_for_invalid_n_chars():
+@pytest.mark.parametrize(
+    ("combine_text_under_n_chars", "new_after_n_chars", "max_characters"),
+    [
+        (-1, -1, -1),
+        (0, 0, 0),
+        (-5666, -6777, -8999),
+        (-5, 40, 50),
+        (50, 100, 20),
+    ],
+)
+def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
+    combine_text_under_n_chars,
+    new_after_n_chars,
+    max_characters,
+):
     elements = [
         Title("A Great Day"),
         Text("Today is a great day."),
@@ -258,7 +280,12 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars():
         CheckBox(),
     ]
     with pytest.raises(ValueError):
-        chunk_by_title(elements, combine_under_n_chars=1, new_after_n_chars=0)
+        chunk_by_title(
+            elements,
+            combine_text_under_n_chars=combine_text_under_n_chars,
+            new_after_n_chars=new_after_n_chars,
+            max_characters=max_characters,
+        )
 
 
 def test_chunk_by_title_drops_extra_metadata():
@@ -335,7 +362,7 @@ def test_chunk_by_title_drops_extra_metadata():
         ),
     ]
 
-    chunks = chunk_by_title(elements, combine_under_n_chars=0)
+    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
 
     assert str(chunks[0]) == str(
         CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/csv/test_csv.py
index 050c2c2567..3f3d5e4ae0 100644
--- a/test_unstructured/partition/csv/test_csv.py
+++ b/test_unstructured/partition/csv/test_csv.py
@@ -8,6 +8,7 @@
     EXPECTED_TEXT,
     EXPECTED_TEXT_WITH_EMOJI,
 )
+from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.csv import partition_csv
@@ -189,3 +190,18 @@ def test_partition_csv_with_json(filename, expected_text, expected_table):
     assert elements[0].metadata.filename == test_elements[0].metadata.filename
     for i in range(len(elements)):
         assert elements[i] == test_elements[i]
+
+
+def test_add_chunking_strategy_to_partition_csv_non_default():
+    filename = "example-docs/stanley-cups.csv"
+
+    elements = partition_csv(filename=filename)
+    chunk_elements = partition_csv(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=9,
+        combine_text_under_n_chars=0,
+    )
+    chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
index 9c82c9d471..c622c390b4 100644
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@@ -18,6 +18,7 @@
     ListItem,
     NarrativeText,
     Table,
+    TableChunk,
     Text,
     Title,
 )
@@ -422,14 +423,6 @@ def test_partition_docx_with_json(mock_document, expected_elements, tmpdir):
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_docx(filename="example-docs/handbook-1p.docx"):
-    chunk_elements = partition_docx(filename, chunking_strategy="by_title")
-    elements = partition_docx(filename)
-    chunks = chunk_by_title(elements)
-    assert chunk_elements != elements
-    assert chunk_elements == chunks
-
-
 def test_parse_category_depth_by_style():
     partitioner = _DocxPartitioner("example-docs/category-level.docx", None, None, False, None)
 
@@ -489,3 +482,37 @@ def test_parse_category_depth_by_style_name():
 def test_parse_category_depth_by_style_ilvl():
     partitioner = _DocxPartitioner(None, None, None, False, None)
     assert partitioner._parse_category_depth_by_style_ilvl() == 0
+
+
+def test_add_chunking_strategy_on_partition_docx_default_args(
+    filename="example-docs/handbook-1p.docx",
+):
+    chunk_elements = partition_docx(filename, chunking_strategy="by_title")
+    elements = partition_docx(filename)
+    chunks = chunk_by_title(elements)
+
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
+
+
+def test_add_chunking_strategy_on_partition_docx(
+    filename="example-docs/fake-doc-emphasized-text.docx",
+):
+    chunk_elements = partition_docx(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=9,
+        combine_text_under_n_chars=5,
+    )
+    elements = partition_docx(filename)
+    chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5)
+    # remove the last element of the TableChunk list because it will be the leftover slice
+    # and not necessarily the max_characters len
+    table_chunks = [chunk for chunk in chunks if isinstance(chunk, TableChunk)][:-1]
+    other_chunks = [chunk for chunk in chunks if not isinstance(chunk, TableChunk)]
+    for table_chunk in table_chunks:
+        assert len(table_chunk.text) == 9
+    for chunk in other_chunks:
+        assert len(chunk.text) >= 5
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py
index 7d0e741899..991ec1991f 100644
--- a/test_unstructured/partition/epub/test_epub.py
+++ b/test_unstructured/partition/epub/test_epub.py
@@ -193,3 +193,24 @@ def test_add_chunking_strategy_on_partition_epub(
     chunks = chunk_by_title(elements)
     assert chunk_elements != elements
     assert chunk_elements == chunks
+
+
+def test_add_chunking_strategy_on_partition_epub_non_default(
+    filename=os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub"),
+):
+    elements = partition_epub(filename=filename)
+    chunk_elements = partition_epub(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=5,
+        new_after_n_chars=5,
+        combine_text_under_n_chars=0,
+    )
+    chunks = chunk_by_title(
+        elements,
+        max_characters=5,
+        new_after_n_chars=5,
+        combine_text_under_n_chars=0,
+    )
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/markdown/test_md.py b/test_unstructured/partition/markdown/test_md.py
index 33d131b7a3..c73247998c 100644
--- a/test_unstructured/partition/markdown/test_md.py
+++ b/test_unstructured/partition/markdown/test_md.py
@@ -276,7 +276,7 @@ def test_partition_md_with_json(
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_md(
+def test_add_chunking_strategy_by_title_on_partition_md(
     filename="example-docs/README.md",
 ):
     elements = partition_md(filename=filename)
diff --git a/test_unstructured/partition/msg/test_msg.py b/test_unstructured/partition/msg/test_msg.py
index 6e179987a2..7678a6cda5 100644
--- a/test_unstructured/partition/msg/test_msg.py
+++ b/test_unstructured/partition/msg/test_msg.py
@@ -285,7 +285,7 @@ def test_partition_msg_with_pgp_encrypted_message(
     assert "Encrypted email detected" in caplog.text
 
 
-def test_add_chunking_strategy_on_partition_msg(
+def test_add_chunking_strategy_by_title_on_partition_msg(
     filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"),
 ):
     elements = partition_msg(filename=filename)
diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/odt/test_odt.py
index 9fe9b4b99d..982a11f9b4 100644
--- a/test_unstructured/partition/odt/test_odt.py
+++ b/test_unstructured/partition/odt/test_odt.py
@@ -2,7 +2,7 @@
 import pathlib
 
 from unstructured.chunking.title import chunk_by_title
-from unstructured.documents.elements import Table, Title
+from unstructured.documents.elements import Table, TableChunk, Title
 from unstructured.partition.json import partition_json
 from unstructured.partition.odt import partition_odt
 from unstructured.staging.base import elements_to_json
@@ -169,3 +169,24 @@ def test_add_chunking_strategy_on_partition_odt(
     chunks = chunk_by_title(elements)
     assert chunk_elements != elements
     assert chunk_elements == chunks
+
+
+def test_add_chunking_strategy_on_partition_odt_non_default():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    elements = partition_odt(filename=filename)
+    chunk_elements = partition_odt(
+        filename,
+        chunking_strategy="by_title",
+        max_characters=7,
+        combine_text_under_n_chars=5,
+    )
+    chunks = chunk_by_title(
+        elements,
+        max_characters=7,
+        combine_text_under_n_chars=5,
+    )
+    for chunk in chunk_elements:
+        if isinstance(chunk, TableChunk):
+            assert len(chunk.text) <= 7
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py
index e2c9496356..721eed64dd 100644
--- a/test_unstructured/partition/pdf-image/test_image.py
+++ b/test_unstructured/partition/pdf-image/test_image.py
@@ -460,6 +460,25 @@ def test_add_chunking_strategy_on_partition_image(
     assert chunk_elements == chunks
 
 
+def test_add_chunking_strategy_on_partition_image_hi_res(
+    filename="example-docs/layout-parser-paper-with-table.jpg",
+):
+    elements = image.partition_image(
+        filename=filename,
+        strategy="hi_res",
+        infer_table_structure=True,
+    )
+    chunk_elements = image.partition_image(
+        filename,
+        strategy="hi_res",
+        infer_table_structure=True,
+        chunking_strategy="by_title",
+    )
+    chunks = chunk_by_title(elements)
+    assert chunk_elements != elements
+    assert chunk_elements == chunks
+
+
 def test_partition_image_uses_model_name():
     with mock.patch.object(
         pdf,
diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py
index e14a793a2a..37af371598 100644
--- a/test_unstructured/partition/pdf-image/test_pdf.py
+++ b/test_unstructured/partition/pdf-image/test_pdf.py
@@ -838,7 +838,7 @@ def test_partition_pdf_with_ocr_coordinates_are_not_nan_from_file(
                     assert point[1] is not math.nan
 
 
-def test_add_chunking_strategy_on_partition_pdf(
+def test_add_chunking_strategy_by_title_on_partition_pdf(
     filename="example-docs/layout-parser-paper-fast.pdf",
 ):
     elements = pdf.partition_pdf(filename=filename)
diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/pptx/test_ppt.py
index 1662002ddd..3750e0e9c6 100644
--- a/test_unstructured/partition/pptx/test_ppt.py
+++ b/test_unstructured/partition/pptx/test_ppt.py
@@ -174,7 +174,7 @@ def test_partition_ppt_with_json(
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_ppt(
+def test_add_chunking_strategy_by_title_on_partition_ppt(
     filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt"),
 ):
     elements = partition_ppt(filename=filename)
diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py
index 3540c020e7..37e9b7ce3e 100644
--- a/test_unstructured/partition/pptx/test_pptx.py
+++ b/test_unstructured/partition/pptx/test_pptx.py
@@ -371,8 +371,8 @@ def test_partition_pptx_with_json():
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_pptx():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
+def test_add_chunking_strategy_by_title_on_partition_pptx():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "science-exploration-1p.pptx")
     elements = partition_pptx(filename=filename)
     chunk_elements = partition_pptx(filename, chunking_strategy="by_title")
     chunks = chunk_by_title(elements)
diff --git a/test_unstructured/partition/pypandoc/test_org.py b/test_unstructured/partition/pypandoc/test_org.py
index 9017c5e86f..81ad6d4ed2 100644
--- a/test_unstructured/partition/pypandoc/test_org.py
+++ b/test_unstructured/partition/pypandoc/test_org.py
@@ -136,7 +136,7 @@ def test_partition_org_with_json(filename="example-docs/README.org"):
         assert elements[i] == test_elements[i]
 
 
-def test_add_chunking_strategy_on_partition_org(
+def test_add_chunking_strategy_by_title_on_partition_org(
     filename="example-docs/README.org",
 ):
     elements = partition_org(filename=filename)
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index dcacf01ba2..a0c907aad3 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -17,6 +17,7 @@
     ListItem,
     NarrativeText,
     Table,
+    TableChunk,
     Text,
     Title,
 )
@@ -937,37 +938,45 @@ def test_get_partition_with_extras_prompts_for_install_if_missing():
 
 def test_add_chunking_strategy_on_partition_auto():
     filename = "example-docs/example-10k-1p.html"
-    chunk_elements = partition(filename, chunking_strategy="by_title")
     elements = partition(filename)
+    chunk_elements = partition(filename, chunking_strategy="by_title")
     chunks = chunk_by_title(elements)
     assert chunk_elements != elements
     assert chunk_elements == chunks
 
 
-def test_add_chunking_strategy_on_partition_auto_respects_multipage():
+def test_add_chunking_strategy_title_on_partition_auto_respects_multipage():
     filename = "example-docs/example-10k-1p.html"
     partitioned_elements_multipage_false_combine_chars_0 = partition(
         filename,
         chunking_strategy="by_title",
         multipage_sections=False,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     partitioned_elements_multipage_true_combine_chars_0 = partition(
         filename,
         chunking_strategy="by_title",
         multipage_sections=True,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     elements = partition(filename)
     cleaned_elements_multipage_false_combine_chars_0 = chunk_by_title(
         elements,
         multipage_sections=False,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     cleaned_elements_multipage_true_combine_chars_0 = chunk_by_title(
         elements,
         multipage_sections=True,
-        combine_under_n_chars=0,
+        combine_text_under_n_chars=0,
+        new_after_n_chars=300,
+        max_characters=400,
     )
     assert (
         partitioned_elements_multipage_false_combine_chars_0
@@ -980,3 +989,69 @@ def test_add_chunking_strategy_on_partition_auto_respects_multipage():
     assert len(partitioned_elements_multipage_true_combine_chars_0) != len(
         partitioned_elements_multipage_false_combine_chars_0,
     )
+
+
+def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
+    filename = "example-docs/example-10k-1p.html"
+
+    # chunk size is set explicitly to 200 chars here (the by_title default is 500)
+    partitioned_table_elements_200_chars = [
+        e
+        for e in partition(
+            filename,
+            chunking_strategy="by_title",
+            max_characters=200,
+            combine_text_under_n_chars=5,
+        )
+        if isinstance(e, (Table, TableChunk))
+    ]
+
+    partitioned_table_elements_5_chars = [
+        e
+        for e in partition(
+            filename,
+            chunking_strategy="by_title",
+            max_characters=5,
+            combine_text_under_n_chars=5,
+        )
+        if isinstance(e, (Table, TableChunk))
+    ]
+
+    elements = partition(filename)
+
+    table_elements = [e for e in elements if isinstance(e, Table)]
+
+    assert len(partitioned_table_elements_5_chars) != len(table_elements)
+    assert len(partitioned_table_elements_200_chars) != len(table_elements)
+
+    assert len(partitioned_table_elements_5_chars[0].text) == 5
+    assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5
+
+    # the first table element is under 200 chars so doesn't get chunked!
+    assert table_elements[0] == partitioned_table_elements_200_chars[0]
+    assert len(partitioned_table_elements_200_chars[0].text) < 200
+    assert len(partitioned_table_elements_200_chars[1].text) == 200
+    assert len(partitioned_table_elements_200_chars[1].metadata.text_as_html) == 200
+
+
+def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
+    filename = "example-docs/example-10k-1p.html"
+
+    # default chunk size in chars is 200
+    partitioned_table_elements_200_chars = [
+        e
+        for e in partition(
+            filename,
+            chunking_strategy="by_num_characters",
+        )
+        if isinstance(e, Table)
+    ]
+
+    i = 0
+    for table in partitioned_table_elements_200_chars:
+        # have to reset the counter to 0 here when we encounter a Table element
+        if isinstance(table, Table):
+            i = 0
+        if i > 0 and isinstance(table, TableChunk):
+            assert table.metadata.is_continuation is True
+            i += 1
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 5f8fd628c9..a4cf981717 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.19-dev6"  # pragma: no cover
+__version__ = "0.10.19-dev7"  # pragma: no cover
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
index 8fd38d62f3..0c5bde799c 100644
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@@ -1,6 +1,7 @@
+import copy
 import functools
 import inspect
-from typing import Any, Callable, Dict, List, TypeVar
+from typing import Any, Callable, Dict, List, Optional, TypeVar, Union
 
 from typing_extensions import ParamSpec
 
@@ -9,96 +10,130 @@
     Element,
     ElementMetadata,
     Table,
+    TableChunk,
     Text,
     Title,
 )
 
 
+def chunk_table_element(
+    element: Table,
+    max_characters: Optional[int] = 500,
+) -> List[Union[Table, TableChunk]]:
+    text = element.text
+    html = getattr(element, "text_as_html", None)
+
+    if len(text) <= max_characters and (  # type: ignore
+        html is None or len(html) <= max_characters  # type: ignore
+    ):
+        return [element]
+
+    chunks: List[Union[Table, TableChunk]] = []
+    metadata = copy.copy(element.metadata)
+    is_continuation = False
+
+    while text or html:
+        text_chunk, text = text[:max_characters], text[max_characters:]
+        table_chunk = TableChunk(text=text_chunk, metadata=copy.copy(metadata))
+
+        if html:
+            html_chunk, html = html[:max_characters], html[max_characters:]
+            table_chunk.metadata.text_as_html = html_chunk
+
+        if is_continuation:
+            table_chunk.metadata.is_continuation = True
+
+        chunks.append(table_chunk)
+        is_continuation = True
+
+    return chunks
+
+
 def chunk_by_title(
     elements: List[Element],
     multipage_sections: bool = True,
-    combine_under_n_chars: int = 500,
-    new_after_n_chars: int = 1500,
+    combine_text_under_n_chars: int = 500,
+    new_after_n_chars: int = 500,
+    max_characters: int = 500,
 ) -> List[Element]:
     """Uses title elements to identify sections within the document for chunking. Splits
     off into a new section when a title is detected or if metadata changes, which happens
     when page numbers or sections change. Cuts off sections once they have exceeded
-    a character length of new_after_n_chars.
+    a character length of max_characters.
 
     Parameters
     ----------
     elements
-        A list of unstructured elements. Usually the ouput of a partition functions.
+        A list of unstructured elements. Usually the output of a partition function.
     multipage_sections
         If True, sections can span multiple pages. Defaults to True.
-    combine_under_n_chars
+    combine_text_under_n_chars
         Combines elements (for example a series of titles) until a section reaches
         a length of n characters.
     new_after_n_chars
-        Cuts off new sections once they reach a length of n characters
+        Cuts off new sections once they reach a length of n characters (soft max)
+    max_characters
+        Chunks table elements text and text_as_html into chunks of length n characters (hard max)
+        TODO: (amanda) extend to other elements
     """
     if (
-        combine_under_n_chars is not None
+        combine_text_under_n_chars is not None
         and new_after_n_chars is not None
+        and max_characters is not None
         and (
-            combine_under_n_chars > new_after_n_chars
-            or combine_under_n_chars < 0
+            combine_text_under_n_chars > new_after_n_chars
+            or combine_text_under_n_chars < 0
             or new_after_n_chars < 0
+            or max_characters <= 0
+            or combine_text_under_n_chars > max_characters
         )
     ):
         raise ValueError(
-            "Invalid values for combine_under_n_chars and/or new_after_n_chars.",
+            "Invalid values for combine_text_under_n_chars and/or max_characters.",
         )
 
     chunked_elements: List[Element] = []
     sections = _split_elements_by_title_and_table(
         elements,
         multipage_sections=multipage_sections,
-        combine_under_n_chars=combine_under_n_chars,
+        combine_text_under_n_chars=combine_text_under_n_chars,
         new_after_n_chars=new_after_n_chars,
     )
-
     for section in sections:
         if not section:
             continue
-        if not isinstance(section[0], Text) or isinstance(section[0], Table):
-            chunked_elements.extend(section)
 
-        elif isinstance(section[0], Text):
-            text = ""
-            metadata = section[0].metadata
+        first_element = section[0]
 
-            for i, element in enumerate(section):
-                if isinstance(element, Text):
-                    text += "\n\n" if text else ""
-                    start_char = len(text)
-                    text += element.text
+        if not isinstance(first_element, Text):
+            chunked_elements.extend(section)
+            continue
 
-                for attr, value in vars(element.metadata).items():
-                    if not isinstance(value, list):
-                        continue
+        elif isinstance(first_element, Table):
+            chunked_elements.extend(chunk_table_element(first_element, max_characters))
+            continue
 
-                    _value = getattr(metadata, attr, [])
-                    if _value is None:
-                        _value = []
+        text = ""
+        metadata = first_element.metadata
+        start_char = 0
+        for element in section:
+            if isinstance(element, Text):
+                text += "\n\n" if text else ""
+                start_char = len(text)
+                text += element.text
+            for attr, value in vars(element.metadata).items():
+                if isinstance(value, list):
+                    _value = getattr(metadata, attr, []) or []
 
                     if attr == "regex_metadata":
                         for item in value:
                             item["start"] += start_char
                             item["end"] += start_char
 
-                    if i > 0:
-                        # NOTE(newelh): Previously, _value was extended with value.
-                        # This caused a memory error if the content was a list of strings
-                        # with a large number of elements -- doubling the list size each time.
-                        # This now instead ensures that the _value list is unique and updated.
-                        for item in value:
-                            if item not in _value:
-                                _value.append(item)
-
-                        setattr(metadata, attr, _value)
+                    _value.extend(item for item in value if item not in _value)
+                    setattr(metadata, attr, _value)
 
-            chunked_elements.append(CompositeElement(text=text, metadata=metadata))
+        chunked_elements.append(CompositeElement(text=text, metadata=metadata))
 
     return chunked_elements
 
@@ -106,8 +141,8 @@ def chunk_by_title(
 def _split_elements_by_title_and_table(
     elements: List[Element],
     multipage_sections: bool = True,
-    combine_under_n_chars: int = 500,
-    new_after_n_chars: int = 1500,
+    combine_text_under_n_chars: int = 500,
+    new_after_n_chars: int = 500,
 ) -> List[List[Element]]:
     sections: List[List[Element]] = []
     section: List[Element] = []
@@ -123,11 +158,11 @@ def _split_elements_by_title_and_table(
             )
 
         section_length = sum([len(str(element)) for element in section])
-        new_section = (isinstance(element, Title) and section_length > combine_under_n_chars) or (
-            not metadata_matches or section_length > new_after_n_chars
-        )
+        new_section = (
+            isinstance(element, Title) and section_length > combine_text_under_n_chars
+        ) or (not metadata_matches or section_length > new_after_n_chars)
 
-        if isinstance(element, Table) or not isinstance(element, Text):
+        if not isinstance(element, Text) or isinstance(element, Table):
             sections.append(section)
             sections.append([element])
             section = []
@@ -185,7 +220,7 @@ def add_chunking_strategy() -> Callable[[Callable[_P, List[Element]]], Callable[
     """Decorator for chuncking text. Uses title elements to identify sections within the document
     for chunking. Splits off a new section when a title is detected or if metadata changes,
     which happens when page numbers or sections change. Cuts off sections once they have exceeded
-    a character length of new_after_n_chars."""
+    a character length of new_after_n_chars; Table elements are chunked to max_characters."""
 
     def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
         if func.__doc__ and (
@@ -199,11 +234,15 @@ def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
                 + "\n\tAdditional Parameters:"
                 + "\n\t\tmultipage_sections"
                 + "\n\t\t\tIf True, sections can span multiple pages. Defaults to True."
-                + "\n\t\tcombine_under_n_chars"
+                + "\n\t\tcombine_text_under_n_chars"
                 + "\n\t\t\tCombines elements (for example a series of titles) until a section"
                 + "\n\t\t\treaches a length of n characters."
                 + "\n\t\tnew_after_n_chars"
-                + "\n\t\t\tCuts off new sections once they reach a length of n characters"
+                + "\n\t\t\t Cuts off new sections once they reach a length of n characters"
+                + "\n\t\t\t a soft max."
+                + "\n\t\tmax_characters"
+                + "\n\t\t\tChunks table elements text and text_as_html into chunks"
+                + "\n\t\t\tof length n characters, a hard max."
             )
 
         @functools.wraps(func)
@@ -218,8 +257,9 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
                 elements = chunk_by_title(
                     elements,
                     multipage_sections=params.get("multipage_sections", True),
-                    combine_under_n_chars=params.get("combine_under_n_chars", 500),
-                    new_after_n_chars=params.get("new_after_n_chars", 1500),
+                    combine_text_under_n_chars=params.get("combine_text_under_n_chars", 500),
+                    new_after_n_chars=params.get("new_after_n_chars", 500),
+                    max_characters=params.get("max_characters", 500),
                 )
             return elements
 
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index f051e1b4f6..75c15a3e36 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -185,6 +185,10 @@ class ElementMetadata:
     # Metadata extracted via regex
     regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None
 
+    # Chunking metadata fields
+    num_characters: Optional[int] = None
+    is_continuation: Optional[bool] = None
+
     # Detection Model Class Probabilities from Unstructured-Inference Hi-Res
     detection_class_prob: Optional[float] = None
 
@@ -566,6 +570,14 @@ class Table(Text):
     pass
 
 
+class TableChunk(Table):
+    """An element for capturing chunks of tables."""
+
+    category = "Table"
+
+    pass
+
+
 class Header(Text):
     """An element for capturing document headers."""
 
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
index c76fdfb783..caefa50afd 100644
--- a/unstructured/ingest/interfaces.py
+++ b/unstructured/ingest/interfaces.py
@@ -83,16 +83,16 @@ def get_embedder(self) -> BaseEmbeddingEncoder:
 class ChunkingConfig(BaseConfig):
     chunk_elements: bool = False
     multipage_sections: bool = True
-    combine_under_n_chars: int = 500
-    new_after_n_chars: int = 1500
+    combine_text_under_n_chars: int = 500
+    max_characters: int = 1500
 
     def chunk(self, elements: t.List[Element]) -> t.List[Element]:
         if self.chunk_elements:
             return chunk_by_title(
                 elements=elements,
                 multipage_sections=self.multipage_sections,
-                combine_under_n_chars=self.combine_under_n_chars,
-                new_after_n_chars=self.new_after_n_chars,
+                combine_text_under_n_chars=self.combine_text_under_n_chars,
+                max_characters=self.max_characters,
             )
         else:
             return elements
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index 6a7314de03..2528f321cd 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from lxml.html.soupparser import fromstring as soupparser_fromstring
 
+from unstructured.chunking.title import add_chunking_strategy
 from unstructured.documents.elements import (
     Element,
     ElementMetadata,
@@ -21,6 +22,7 @@
 
 @process_metadata()
 @add_metadata_with_filetype(FileType.CSV)
+@add_chunking_strategy()
 def partition_csv(
     filename: Optional[str] = None,
     file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index 2f4538210f..ebffd6cdf9 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -4,6 +4,7 @@
 import pandas as pd
 from lxml.html.soupparser import fromstring as soupparser_fromstring
 
+from unstructured.chunking.title import add_chunking_strategy
 from unstructured.documents.elements import (
     Element,
     ElementMetadata,
@@ -21,6 +22,7 @@
 
 @process_metadata()
 @add_metadata_with_filetype(FileType.XLSX)
+@add_chunking_strategy()
 def partition_xlsx(
     filename: Optional[str] = None,
     file: Optional[Union[IO[bytes], SpooledTemporaryFile]] = None,
diff --git a/unstructured/staging/weaviate.py b/unstructured/staging/weaviate.py
index c6efc80bd4..4a4e15276c 100644
--- a/unstructured/staging/weaviate.py
+++ b/unstructured/staging/weaviate.py
@@ -15,6 +15,7 @@ class Properties(TypedDict):
     "regex_metadata",
     "emphasized_texts",
     "detection_class_prob",
+    "is_continuation",
 )