diff --git a/.github/actions/base-cache/action.yml b/.github/actions/base-cache/action.yml index 2f5ff95c79..05b1ddc227 100644 --- a/.github/actions/base-cache/action.yml +++ b/.github/actions/base-cache/action.yml @@ -22,7 +22,7 @@ runs: lookup-only: ${{ inputs.check-only }} - name: Set up Python ${{ inputs.python-version }} if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true' - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ inputs.python-version }} - name: Setup virtual environment (no cache hit) @@ -30,7 +30,9 @@ runs: shell: bash run: | python${{ inputs.python-version }} -m pip install --upgrade virtualenv - python${{ inputs.python-version }} -m venv .venv + if [ ! -d ".venv" ]; then + python${{ inputs.python-version }} -m venv .venv + fi source .venv/bin/activate [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA" if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then @@ -38,6 +40,7 @@ runs: python -m pip install --upgrade setuptools fi make install-ci + make install-nltk-models - name: Save Cache if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true' id: virtualenv-cache-save diff --git a/.github/actions/base-ingest-cache/action.yml b/.github/actions/base-ingest-cache/action.yml index fe97c35555..dc9d5105a2 100644 --- a/.github/actions/base-ingest-cache/action.yml +++ b/.github/actions/base-ingest-cache/action.yml @@ -18,25 +18,30 @@ runs: path: | .venv nltk_data - key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }} + key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }} lookup-only: ${{ inputs.check-only }} - - name: Restore base virtual environment - if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' - uses: ./.github/actions/base-cache - with: - python-version: ${{ inputs.python-version }} - name: Set up Python ${{ inputs.python-version }} if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ inputs.python-version }} - name: Setup virtual environment (no cache hit) if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' shell: bash run: | - python${{ inputs.python-version }} -m venv .venv + python${{ inputs.python-version }} -m pip install --upgrade virtualenv + if [ ! -d ".venv" ]; then + python${{ inputs.python-version }} -m venv .venv + fi source .venv/bin/activate - make install-all-ingest + if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then + python -m ensurepip --upgrade + python -m pip install --upgrade setuptools + fi + make install-ci + make install-nltk-models + make install-all-docs + make install-ingest - name: Save Ingest Cache if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' id: ingest-virtualenv-cache-save @@ -45,5 +50,5 @@ runs: path: | .venv nltk_data - key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }} + key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5fa655250..88fe84680b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,14 +12,15 @@ permissions: id-token: write contents: read +env: + NLTK_DATA: ${{ github.workspace }}/nltk_data + jobs: setup: strategy: matrix: python-version: ["3.9","3.10","3.11", "3.12"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data steps: - uses: actions/checkout@v4 - uses: ./.github/actions/base-cache @@ -55,12 +56,29 @@ jobs: - name: Install all extras run: make check-extras + check-licenses: + strategy: + matrix: + python-version: [ "3.12" ] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + # NOTE(robinson) - dependencies are installed first because liccheck + # produces an error if there is a a mismatch between the dep version + # in the requirements file and the dep version in site packages + - name: Install all doc and test dependencies + run: | + make install-ci + make check-licenses + lint: strategy: matrix: python-version: ["3.9","3.10","3.11"] - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data runs-on: ubuntu-latest needs: [setup, changelog] steps: @@ -76,6 +94,7 @@ jobs: - name: Lint run: | source .venv/bin/activate + make install-ci make check shellcheck: @@ -100,8 +119,6 @@ jobs: matrix: python-version: ["3.9","3.10","3.11", "3.12"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup, lint] steps: - uses: actions/checkout@v4 @@ -135,46 +152,11 @@ jobs: make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true make check-coverage - test_chipper: - strategy: - matrix: - python-version: ["3.10"] - runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data - UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} - needs: [setup, lint] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Setup virtual environment - uses: ./.github/actions/base-cache - with: - python-version: ${{ matrix.python-version }} - - name: Test - env: - UNS_API_KEY: ${{ secrets.UNS_API_KEY }} - run: | - source .venv/bin/activate - sudo apt-get update - sudo apt-get install -y poppler-utils - make install-pandoc install-test - sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 - sudo apt-get update - sudo apt-get install -y tesseract-ocr tesseract-ocr-kor - tesseract --version - make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true - test_unit_no_extras: strategy: matrix: python-version: ["3.10"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup, lint] steps: - uses: actions/checkout@v4 @@ -191,6 +173,8 @@ jobs: UNS_API_KEY: ${{ secrets.UNS_API_KEY }} run: | source .venv/bin/activate + make install-ci + make install-nltk-models make test-no-extras CI=true test_unit_dependency_extras: @@ -198,10 +182,8 @@ jobs: strategy: matrix: python-version: ["3.10"] - extra: ["csv", "docx", "odt", "markdown", "pypandoc", "msg", "pdf-image", "pptx", "xlsx"] + extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup, lint, test_unit_no_extras] steps: - uses: actions/checkout@v4 @@ -235,6 +217,7 @@ jobs: sudo apt-get update sudo apt-get install -y tesseract-ocr tesseract-ocr-kor tesseract --version + make install-${{ matrix.extra }} make test-extra-${{ matrix.extra }} CI=true setup_ingest: @@ -242,50 +225,19 @@ jobs: matrix: python-version: [ "3.9","3.10" ] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup] steps: - uses: actions/checkout@v4 - uses: ./.github/actions/base-ingest-cache with: python-version: ${{ matrix.python-version }} - - test_ingest_unit: - strategy: - matrix: - python-version: [ "3.9","3.10" ] - runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data - needs: [ setup_ingest, lint ] - steps: - # actions/checkout MUST come before auth - - uses: 'actions/checkout@v4' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Get full Python version - id: full-python-version - run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT - - name: Setup virtual environment - uses: ./.github/actions/base-ingest-cache - with: - python-version: ${{ matrix.python-version }} - - name: Test Ingest (unit) - run: | - source .venv/bin/activate - PYTHONPATH=. pytest test_unstructured_ingest/unit - + check-only: 'true' test_ingest_src: strategy: matrix: python-version: ["3.9","3.10"] runs-on: ubuntu-latest-m - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest, lint] steps: # actions/checkout MUST come before auth @@ -348,9 +300,10 @@ jobs: PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} - TABLE_OCR: "tesseract" + MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}} OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" CI: "true" + PYTHON: python${{ matrix.python-version }} run: | source .venv/bin/activate sudo apt-get update @@ -362,111 +315,17 @@ jobs: sudo apt-get install -y tesseract-ocr-kor sudo apt-get install diffstat tesseract --version + make install-all-docs + make install-ingest ./test_unstructured_ingest/test-ingest-src.sh - test_ingest_dest: - environment: ci - strategy: - matrix: - python-version: ["3.9","3.10"] - runs-on: ubuntu-latest-m - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data - needs: [setup_ingest, lint] - steps: - # actions/checkout MUST come before auth - - uses: 'actions/checkout@v4' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Get full Python version - id: full-python-version - run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT - - name: Setup virtual environment - uses: ./.github/actions/base-ingest-cache - with: - python-version: ${{ matrix.python-version }} - - name: Setup docker-compose - uses: KengoTODA/actions-setup-docker-compose@v1 - with: - version: '2.22.0' - - name: Test (end-to-end) - env: - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }} - S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }} - AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} - AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} - BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} - DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} - DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} - DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} - GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - MONGODB_URI: ${{ secrets.MONGODB_URI }} - MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} - AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }} - PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} - VECTARA_OAUTH_CLIENT_ID: ${{secrets.VECTARA_OAUTH_CLIENT_ID}} - VECTARA_OAUTH_SECRET: ${{secrets.VECTARA_OAUTH_SECRET}} - VECTARA_CUSTOMER_ID: ${{secrets.VECTARA_CUSTOMER_ID}} - ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} - ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} - CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}} - DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}} - DATABRICKS_USERNAME: ${{secrets.DATABRICKS_USERNAME}} - DATABRICKS_PASSWORD: ${{secrets.DATABRICKS_PASSWORD}} - DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}} - TABLE_OCR: "tesseract" - OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" - CI: "true" - run: | - source .venv/bin/activate - sudo apt-get update - sudo apt-get install -y libmagic-dev poppler-utils libreoffice - make install-pandoc - sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 - sudo apt-get update - sudo apt-get install -y tesseract-ocr - sudo apt-get install -y tesseract-ocr-kor - sudo apt-get install diffstat - tesseract --version - ./test_unstructured_ingest/test-ingest-dest.sh - - test_ingest_help: - environment: ci - strategy: - matrix: - python-version: ["3.9","3.10","3.11", "3.12"] - runs-on: ubuntu-latest - needs: [setup_ingest, lint] - steps: - - uses: 'actions/checkout@v4' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Setup virtual environment - uses: ./.github/actions/base-ingest-cache - with: - python-version: ${{ matrix.python-version }} - - name: Validate --help - run: | - source .venv/bin/activate - ./test_unstructured_ingest/test-help.sh - - test_unstructured_api_unit: strategy: matrix: # NOTE(yuming): Unstructured API only use Python 3.10 python-version: ["3.10"] runs-on: ubuntu-latest - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup, lint] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index a2b7de888e..632f4ac3e1 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -35,14 +35,12 @@ jobs: update-fixtures-and-pr: runs-on: ubuntu-latest-m - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest] steps: # actions/checkout MUST come before auth - uses: 'actions/checkout@v4' - name: Set up Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ env.PYTHON_VERSION }} - name: Get full Python version @@ -96,7 +94,8 @@ jobs: AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }} - TABLE_OCR: "tesseract" + ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} + ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" OVERWRITE_FIXTURES: "true" CI: "true" @@ -108,6 +107,7 @@ jobs: sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 sudo apt-get install -y tesseract-ocr sudo apt-get install -y tesseract-ocr-kor + sudo apt-get install diffstat tesseract --version ./test_unstructured_ingest/test-ingest-src.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f77c6a646..ff86e84aa2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,17 +1,319 @@ -## 0.14.11-dev6 +## 0.16.6-dev2 ### Enhancements +- **Every
\n", - " | text | \n", - "prediction | \n", - "prediction_agent | \n", - "annotation | \n", - "annotation_agent | \n", - "id | \n", - "metadata | \n", - "status | \n", - "event_timestamp | \n", - "metrics | \n", - "search_keywords | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "Russian forces are completing the reinforcemen... | \n", - "None | \n", - "None | \n", - "Russian forces are setting conditions to envel... | \n", - "None | \n", - "1c728c08b07bf47c5ec573bf78350c50 | \n", - "{} | \n", - "Validated | \n", - "None | \n", - "None | \n", - "None | \n", - "
1 | \n", - "Russian forces resumed offensive operations in... | \n", - "None | \n", - "None | \n", - "Russian forces resumed offensive operations ag... | \n", - "None | \n", - "e03b12744a53d8393620c617b5d82f27 | \n", - "{} | \n", - "Validated | \n", - "None | \n", - "None | \n", - "None | \n", - "
2 | \n", - "The Russian military has continued its unsucce... | \n", - "None | \n", - "None | \n", - "Russian forces opened a new line of advance fr... | \n", - "None | \n", - "1852425c2dc32a35274b2ac112b43221 | \n", - "{} | \n", - "Validated | \n", - "None | \n", - "None | \n", - "None | \n", - "
3 | \n", - "Russian forces continue their focus on encircl... | \n", - "None | \n", - "None | \n", - "Russian forces have advanced rapidly on the ea... | \n", - "None | \n", - "9f094b6a9d30b9529aa630d818d143ae | \n", - "{} | \n", - "Validated | \n", - "None | \n", - "None | \n", - "None | \n", - "
4 | \n", - "Russian forces remain deployed in the position... | \n", - "None | \n", - "None | \n", - "Russian forces conducted no major offensive op... | \n", - "None | \n", - "d4c88cb002d4fa75d7273c3206cbde93 | \n", - "{} | \n", - "Validated | \n", - "None | \n", - "None | \n", - "None | \n", - "
Epoch | \n", - "Training Loss | \n", - "Validation Loss | \n", - "
---|---|---|
1 | \n", - "No log | \n", - "3.969428 | \n", - "
"
- ],
- "text/plain": [
- " "
- ],
- "text/plain": [
- "\n",
- " \n",
- "
\n",
- "\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " topic_0 \n",
- " topic_1 \n",
- " topic_2 \n",
- " topic_3 \n",
- " topic_4 \n",
- " topic_5 \n",
- " topic_6 \n",
- " topic_7 \n",
- " topic_8 \n",
- " topic_9 \n",
- " \n",
- " \n",
- " 0 \n",
- " neural \n",
- " language \n",
- " state \n",
- " function \n",
- " cost \n",
- " publication \n",
- " graph \n",
- " llama \n",
- " tangkhul \n",
- " want \n",
- " \n",
- " \n",
- " 1 \n",
- " network \n",
- " natural \n",
- " rnn \n",
- " distribution \n",
- " function \n",
- " april \n",
- " computation \n",
- " like \n",
- " compound \n",
- " edu \n",
- " \n",
- " \n",
- " 2 \n",
- " function \n",
- " model \n",
- " memory \n",
- " output \n",
- " sgd \n",
- " syst \n",
- " node \n",
- " south \n",
- " root \n",
- " dsontagcoursesinferenceslidespseudolikelihoodn... \n",
- " \n",
- " \n",
- " 3 \n",
- " networks \n",
- " word \n",
- " vector \n",
- " class \n",
- " training \n",
- " technol \n",
- " nodes \n",
- " animal \n",
- " morphological \n",
- " regardlessly \n",
- " \n",
- " \n",
- " 4 \n",
- " one \n",
- " planning \n",
- " input \n",
- " tanh \n",
- " expected \n",
- " date \n",
- " backward \n",
- " america \n",
- " verbs \n",
- " satisfied \n",
- " \n",
- " \n",
- " 5 \n",
- " input \n",
- " words \n",
- " network \n",
- " data \n",
- " optimization \n",
- " vol \n",
- " function \n",
- " translation \n",
- " noun \n",
- " november \n",
- " \n",
- " \n",
- " 6 \n",
- " vector \n",
- " based \n",
- " recurrent \n",
- " yˆ \n",
- " algorithm \n",
- " intell \n",
- " backpropagation \n",
- " french \n",
- " roots \n",
- " tune \n",
- " \n",
- " \n",
- " 7 \n",
- " language \n",
- " processing \n",
- " sequence \n",
- " loss \n",
- " set \n",
- " acm \n",
- " algorithm \n",
- " cute \n",
- " adjectives \n",
- " return \n",
- " \n",
- " \n",
- " 8 \n",
- " model \n",
- " models \n",
- " neural \n",
- " activation \n",
- " validation \n",
- " article \n",
- " parameters \n",
- " google \n",
- " formation \n",
- " fully \n",
- " \n",
- " \n",
- " \n",
- "9 \n",
- " training \n",
- " data \n",
- " lstm \n",
- " softmax \n",
- " rate \n",
- " trans \n",
- " output \n",
- " domesticated \n",
- " language \n",
- " results \n",
- "
%{customdata[1]}
Size: %{customdata[2]}",
- "legendgroup": "",
- "marker": {
- "color": "#B0BEC5",
- "line": {
- "color": "DarkSlateGrey",
- "width": 2
- },
- "size": [
- 723,
- 198,
- 122,
- 61,
- 57,
- 46,
- 29,
- 17,
- 13
- ],
- "sizemode": "area",
- "sizeref": 0.451875,
- "symbol": "circle"
- },
- "mode": "markers",
- "name": "",
- "orientation": "v",
- "showlegend": false,
- "type": "scatter",
- "x": [
- 14.759990692138672,
- 14.329012870788574,
- 10.99558162689209,
- 9.891719818115234,
- 11.191701889038086,
- 9.449606895446777,
- 11.662773132324219,
- 14.039092063903809,
- 12.023329734802246
- ],
- "xaxis": "x",
- "y": [
- 1.6729466915130615,
- 2.2927768230438232,
- 5.36309289932251,
- 5.59792423248291,
- 4.721500873565674,
- 5.3096089363098145,
- 5.3371052742004395,
- 1.8039934635162354,
- 4.149565696716309
- ],
- "yaxis": "y"
- }
- ],
- "layout": {
- "annotations": [
- {
- "showarrow": false,
- "text": "D1",
- "x": 8.03216586112976,
- "y": 3.929808777570724,
- "yshift": 10
- },
- {
- "showarrow": false,
- "text": "D2",
- "x": 12.503077578544616,
- "xshift": 10,
- "y": 6.437612867355346
- }
- ],
- "height": 650,
- "hoverlabel": {
- "bgcolor": "white",
- "font": {
- "family": "Rockwell",
- "size": 16
- }
- },
- "legend": {
- "itemsizing": "constant",
- "tracegroupgap": 0
- },
- "margin": {
- "t": 60
- },
- "shapes": [
- {
- "line": {
- "color": "#CFD8DC",
- "width": 2
- },
- "type": "line",
- "x0": 12.503077578544616,
- "x1": 12.503077578544616,
- "y0": 1.4220046877861023,
- "y1": 6.437612867355346
- },
- {
- "line": {
- "color": "#9E9E9E",
- "width": 2
- },
- "type": "line",
- "x0": 8.03216586112976,
- "x1": 16.973989295959473,
- "y0": 3.929808777570724,
- "y1": 3.929808777570724
- }
- ],
- "sliders": [
- {
- "active": 0,
- "pad": {
- "t": 50
- },
- "steps": [
- {
- "args": [
- {
- "marker.color": [
- [
- "red",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 0",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "red",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 1",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "#B0BEC5",
- "red",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 2",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "red",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 3",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "red",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 4",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "red",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 5",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "red",
- "#B0BEC5",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 6",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "red",
- "#B0BEC5"
- ]
- ]
- }
- ],
- "label": "Topic 7",
- "method": "update"
- },
- {
- "args": [
- {
- "marker.color": [
- [
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "#B0BEC5",
- "red"
- ]
- ]
- }
- ],
- "label": "Topic 8",
- "method": "update"
- }
- ]
- }
- ],
- "template": {
- "data": {
- "bar": [
- {
- "error_x": {
- "color": "rgb(36,36,36)"
- },
- "error_y": {
- "color": "rgb(36,36,36)"
- },
- "marker": {
- "line": {
- "color": "white",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "bar"
- }
- ],
- "barpolar": [
- {
- "marker": {
- "line": {
- "color": "white",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "barpolar"
- }
- ],
- "carpet": [
- {
- "aaxis": {
- "endlinecolor": "rgb(36,36,36)",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "rgb(36,36,36)"
- },
- "baxis": {
- "endlinecolor": "rgb(36,36,36)",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "rgb(36,36,36)"
- },
- "type": "carpet"
- }
- ],
- "choropleth": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "type": "choropleth"
- }
- ],
- "contour": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "colorscale": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ],
- "type": "contour"
- }
- ],
- "contourcarpet": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "type": "contourcarpet"
- }
- ],
- "heatmap": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "colorscale": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ],
- "type": "heatmap"
- }
- ],
- "heatmapgl": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "colorscale": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ],
- "type": "heatmapgl"
- }
- ],
- "histogram": [
- {
- "marker": {
- "line": {
- "color": "white",
- "width": 0.6
- }
- },
- "type": "histogram"
- }
- ],
- "histogram2d": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "colorscale": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ],
- "type": "histogram2d"
- }
- ],
- "histogram2dcontour": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "colorscale": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ],
- "type": "histogram2dcontour"
- }
- ],
- "mesh3d": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "type": "mesh3d"
- }
- ],
- "parcoords": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "parcoords"
- }
- ],
- "pie": [
- {
- "automargin": true,
- "type": "pie"
- }
- ],
- "scatter": [
- {
- "fillpattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- },
- "type": "scatter"
- }
- ],
- "scatter3d": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scatter3d"
- }
- ],
- "scattercarpet": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scattercarpet"
- }
- ],
- "scattergeo": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scattergeo"
- }
- ],
- "scattergl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scattergl"
- }
- ],
- "scattermapbox": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scattermapbox"
- }
- ],
- "scatterpolar": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scatterpolar"
- }
- ],
- "scatterpolargl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scatterpolargl"
- }
- ],
- "scatterternary": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "type": "scatterternary"
- }
- ],
- "surface": [
- {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- },
- "colorscale": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ],
- "type": "surface"
- }
- ],
- "table": [
- {
- "cells": {
- "fill": {
- "color": "rgb(237,237,237)"
- },
- "line": {
- "color": "white"
- }
- },
- "header": {
- "fill": {
- "color": "rgb(217,217,217)"
- },
- "line": {
- "color": "white"
- }
- },
- "type": "table"
- }
- ]
- },
- "layout": {
- "annotationdefaults": {
- "arrowhead": 0,
- "arrowwidth": 1
- },
- "autotypenumbers": "strict",
- "coloraxis": {
- "colorbar": {
- "outlinewidth": 1,
- "tickcolor": "rgb(36,36,36)",
- "ticks": "outside"
- }
- },
- "colorscale": {
- "diverging": [
- [
- 0,
- "rgb(103,0,31)"
- ],
- [
- 0.1,
- "rgb(178,24,43)"
- ],
- [
- 0.2,
- "rgb(214,96,77)"
- ],
- [
- 0.3,
- "rgb(244,165,130)"
- ],
- [
- 0.4,
- "rgb(253,219,199)"
- ],
- [
- 0.5,
- "rgb(247,247,247)"
- ],
- [
- 0.6,
- "rgb(209,229,240)"
- ],
- [
- 0.7,
- "rgb(146,197,222)"
- ],
- [
- 0.8,
- "rgb(67,147,195)"
- ],
- [
- 0.9,
- "rgb(33,102,172)"
- ],
- [
- 1,
- "rgb(5,48,97)"
- ]
- ],
- "sequential": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ],
- "sequentialminus": [
- [
- 0,
- "#440154"
- ],
- [
- 0.1111111111111111,
- "#482878"
- ],
- [
- 0.2222222222222222,
- "#3e4989"
- ],
- [
- 0.3333333333333333,
- "#31688e"
- ],
- [
- 0.4444444444444444,
- "#26828e"
- ],
- [
- 0.5555555555555556,
- "#1f9e89"
- ],
- [
- 0.6666666666666666,
- "#35b779"
- ],
- [
- 0.7777777777777778,
- "#6ece58"
- ],
- [
- 0.8888888888888888,
- "#b5de2b"
- ],
- [
- 1,
- "#fde725"
- ]
- ]
- },
- "colorway": [
- "#1F77B4",
- "#FF7F0E",
- "#2CA02C",
- "#D62728",
- "#9467BD",
- "#8C564B",
- "#E377C2",
- "#7F7F7F",
- "#BCBD22",
- "#17BECF"
- ],
- "font": {
- "color": "rgb(36,36,36)"
- },
- "geo": {
- "bgcolor": "white",
- "lakecolor": "white",
- "landcolor": "white",
- "showlakes": true,
- "showland": true,
- "subunitcolor": "white"
- },
- "hoverlabel": {
- "align": "left"
- },
- "hovermode": "closest",
- "mapbox": {
- "style": "light"
- },
- "paper_bgcolor": "white",
- "plot_bgcolor": "white",
- "polar": {
- "angularaxis": {
- "gridcolor": "rgb(232,232,232)",
- "linecolor": "rgb(36,36,36)",
- "showgrid": false,
- "showline": true,
- "ticks": "outside"
- },
- "bgcolor": "white",
- "radialaxis": {
- "gridcolor": "rgb(232,232,232)",
- "linecolor": "rgb(36,36,36)",
- "showgrid": false,
- "showline": true,
- "ticks": "outside"
- }
- },
- "scene": {
- "xaxis": {
- "backgroundcolor": "white",
- "gridcolor": "rgb(232,232,232)",
- "gridwidth": 2,
- "linecolor": "rgb(36,36,36)",
- "showbackground": true,
- "showgrid": false,
- "showline": true,
- "ticks": "outside",
- "zeroline": false,
- "zerolinecolor": "rgb(36,36,36)"
- },
- "yaxis": {
- "backgroundcolor": "white",
- "gridcolor": "rgb(232,232,232)",
- "gridwidth": 2,
- "linecolor": "rgb(36,36,36)",
- "showbackground": true,
- "showgrid": false,
- "showline": true,
- "ticks": "outside",
- "zeroline": false,
- "zerolinecolor": "rgb(36,36,36)"
- },
- "zaxis": {
- "backgroundcolor": "white",
- "gridcolor": "rgb(232,232,232)",
- "gridwidth": 2,
- "linecolor": "rgb(36,36,36)",
- "showbackground": true,
- "showgrid": false,
- "showline": true,
- "ticks": "outside",
- "zeroline": false,
- "zerolinecolor": "rgb(36,36,36)"
- }
- },
- "shapedefaults": {
- "fillcolor": "black",
- "line": {
- "width": 0
- },
- "opacity": 0.3
- },
- "ternary": {
- "aaxis": {
- "gridcolor": "rgb(232,232,232)",
- "linecolor": "rgb(36,36,36)",
- "showgrid": false,
- "showline": true,
- "ticks": "outside"
- },
- "baxis": {
- "gridcolor": "rgb(232,232,232)",
- "linecolor": "rgb(36,36,36)",
- "showgrid": false,
- "showline": true,
- "ticks": "outside"
- },
- "bgcolor": "white",
- "caxis": {
- "gridcolor": "rgb(232,232,232)",
- "linecolor": "rgb(36,36,36)",
- "showgrid": false,
- "showline": true,
- "ticks": "outside"
- }
- },
- "title": {
- "x": 0.05
- },
- "xaxis": {
- "automargin": true,
- "gridcolor": "rgb(232,232,232)",
- "linecolor": "rgb(36,36,36)",
- "showgrid": false,
- "showline": true,
- "ticks": "outside",
- "title": {
- "standoff": 15
- },
- "zeroline": false,
- "zerolinecolor": "rgb(36,36,36)"
- },
- "yaxis": {
- "automargin": true,
- "gridcolor": "rgb(232,232,232)",
- "linecolor": "rgb(36,36,36)",
- "showgrid": false,
- "showline": true,
- "ticks": "outside",
- "title": {
- "standoff": 15
- },
- "zeroline": false,
- "zerolinecolor": "rgb(36,36,36)"
- }
- }
- },
- "title": {
- "font": {
- "color": "Black",
- "size": 22
- },
- "text": "Intertopic Distance Map",
- "x": 0.5,
- "xanchor": "center",
- "y": 0.95,
- "yanchor": "top"
- },
- "width": 650,
- "xaxis": {
- "anchor": "y",
- "domain": [
- 0,
- 1
- ],
- "range": [
- 8.03216586112976,
- 16.973989295959473
- ],
- "title": {
- "text": ""
- },
- "visible": false
- },
- "yaxis": {
- "anchor": "x",
- "domain": [
- 0,
- 1
- ],
- "range": [
- 1.4220046877861023,
- 6.437612867355346
- ],
- "title": {
- "text": ""
- },
- "visible": false
- }
- }
- }
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "topic_model.visualize_topics()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.15"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/examples/chroma-news-of-the-day/README.md b/examples/chroma-news-of-the-day/README.md
deleted file mode 100644
index 1b76b8c24a..0000000000
--- a/examples/chroma-news-of-the-day/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-## News of the Day
-
-This notebook shows how to use Unstructured + Chroma + LangChain to preprocess and
-summarize the news of the day from CNN Lite. To run this notebook, first install
-the dependencies with `pip install -r requirements.txt`. Then run `jupyter lab` and
-open the notebook.
diff --git a/examples/chroma-news-of-the-day/news-of-the-day.ipynb b/examples/chroma-news-of-the-day/news-of-the-day.ipynb
deleted file mode 100644
index 11739abf4e..0000000000
--- a/examples/chroma-news-of-the-day/news-of-the-day.ipynb
+++ /dev/null
@@ -1,275 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "a92fa55b-7051-4aad-9aae-15eec21704d1",
- "metadata": {},
- "source": [
- "# News of the Day\n",
- "\n",
- "In this notebook, we'll show how to use [Unstructured.IO](https://unstructured.io/), [ChromaDB](https://www.trychroma.com/), and [LangChain](https://github.com/langchain-ai/langchain) to summarize topics from the front page of CNN Lite. Without tooling from the modern LLM stack, this would have been a time-consuming project. With Unstructured, Chroma, and LangChain, the entire workflow is less than two dozen lines of code."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "28208685-c4a9-4e59-973c-2144f64dc275",
- "metadata": {},
- "source": [
- "## Gather links with `unstructured`\n",
- "\n",
- "First, we'll gather links from the [CNN Lite](https://lite.cnn.com/) homepage using the `partition_html` function from `unstructured`. When `unstructured` partitions HTML pages, links are included in the metadata for each element, make link collection a simple task. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "d994c585-af48-416e-99fa-6bf9f736c4f7",
- "metadata": {},
- "outputs": [],
- "source": [
- "from unstructured.partition.html import partition_html"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "f3ae24d4-2926-4ea3-ad12-52e372918039",
- "metadata": {},
- "outputs": [],
- "source": [
- "cnn_lite_url = \"https://lite.cnn.com/\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "deca76d9-8f38-4dcc-a34a-466aaba3ab45",
- "metadata": {},
- "outputs": [],
- "source": [
- "elements = partition_html(url=cnn_lite_url)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "cb258d76-459e-4249-88e3-5bd340e06a32",
- "metadata": {},
- "outputs": [],
- "source": [
- "links = []\n",
- "\n",
- "for element in elements:\n",
- " if element.metadata.link_urls:\n",
- " relative_link = element.metadata.link_urls[0][1:]\n",
- " if relative_link.startswith(\"2024\"):\n",
- " links.append(f\"{cnn_lite_url}{relative_link}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "c4189214-0cf4-4ccf-b996-bb89c1d30233",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "98"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(links)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0e340a96-e66b-431a-8416-4a9b4d2bdadf",
- "metadata": {},
- "source": [
- "## Ingest individual articles with `UnstructuredURLLoader`\n",
- "\n",
- "Now that we have the links, we can preprocess individual news articles with `UnstructuredURLLoader`. `UnstructuredURLLoader` fetches content from the web and then uses the `unstructured` `partition` function to extract content and metadata. In this example we preprocess HTML files, but it works with other response types such as `application/pdf` as well. After calling `.load()`, the result is a list of `langchain` `Document` objects."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "cd629187-dcd7-4411-8e61-dea34bae2963",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.document_loaders import UnstructuredURLLoader\n",
- "\n",
- "loaders = UnstructuredURLLoader(urls=links[:20], show_progress_bar=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "6249c4c4-4c0d-4c49-8f0b-4e6f97b5d8c8",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00, 4.67it/s]\n"
- ]
- }
- ],
- "source": [
- "docs = loaders.load()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "aef8c3ea-26e9-4a09-8314-0d1e7580ae26",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Document(page_content='CNN\\n\\n3/13/2024\\n\\nRFK Jr.’s VP prospect Aaron Rodgers has shared\\xa0false\\xa0Sandy Hook conspiracy theories\\xa0in private conversations\\n\\nBy Pamela Brown and Jake Tapper, CNN\\n\\nUpdated: \\n 5:33 PM EDT, Wed March 13, 2024\\n\\nSource: CNN\\n\\nIndependent presidential candidate Robert F. Kennedy Jr. has confirmed that among his potential vice-presidential prospects is New York Jets quarterback Aaron Rodgers,\\xa0who\\xa0in private conversations shared deranged conspiracy theories\\xa0about the 2012 Sandy Hook school shooting not being real.\\n\\nCNN knows of two people with whom Rodgers has enthusiastically shared these stories,\\xa0including with Pamela Brown, one of the journalists writing this piece.\\n\\nBrown was covering the Kentucky Derby for CNN in\\xa02013\\xa0when she was introduced to Rodgers, then with the Green Bay Packers, at a post-Derby party. Hearing that she was a journalist with CNN, Rodgers immediately began attacking the news media for covering up important stories. Rodgers brought up the tragic killing\\xa0of 20 children and 6 adults by a gunman at Sandy Hook Elementary School,\\xa0claiming\\xa0it was actually a government inside job and the media was intentionally ignoring it.\\n\\nWhen Brown questioned him on the evidence to show this very real shooting was staged, Rodgers began sharing various theories that have been disproven numerous times. Such conspiracy theories were also later at the center of lawsuits brought by victims’ families when they sued conspiracy theorist Alex Jones on the matter.\\n\\nJones baselessly repeating lies that the 2012 mass shooting was staged, and that the families and first responders were “crisis actors,” spawned multiple lawsuits and a trial was held in 2022 over lawsuits that were filed in Connecticut.\\n\\nFamily members throughout that trial described in poignant terms how the lies had prompted unrelenting harassment against them and compounded the emotional agony of losing their loved ones.\\n\\nBrown recalls Rodgers asking her if she thought it was off that there were men in black in the woods by the school,\\xa0falsely\\xa0claiming those men were actually government operatives. Brown found the encounter disturbing.\\n\\nRodgers, through one of his agents, declined to comment to CNN.\\n\\nCNN has spoken to another person with a similar story. This person, to whom CNN has granted anonymity so as to avoid harassment, recalled that several years ago, Rodgers claimed, “Sandy Hook never happened…All those children never existed. They were all actors.”\\n\\nWhen asked about the grieving parents, the source recalled Rodgers saying, “They’re all making it up. They’re all actors.”\\n\\nRodgers went on to delve into some of the darker caverns of the false conspiracy theory. This person found the encounter disturbing.\\n\\nKennedy’s campaign\\xa0announced Wednesday that the candidate will name his running mate on March 26 in Oakland, California. Kennedy told CNN in an interview on Tuesday that he had recently met with Rodgers, as well as former Minnesota Gov. Jesse Ventura, about the possibility of joining his campaign.\\n\\nSee Full Web Article\\n\\nGo to the full CNN experience\\n\\n© 2024 Cable News Network. A Warner Bros. Discovery Company. All Rights Reserved.\\n\\nTerms of Use\\n\\nPrivacy Policy\\n\\nAd Choices\\n\\nCookie Settings', metadata={'source': 'https://lite.cnn.com/2024/03/13/politics/aaron-rodgers-sandy-hook-conspiracy-theories/index.html'})"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "90852a76-6804-4eb4-9929-b15a3dcfec4f",
- "metadata": {},
- "source": [
- "## Load documents into ChromaDB\n",
- "\n",
- "With the documents preprocessed, we're now ready to load them into ChromaDB. We accomplish this easily by using the OpenAI embeddings the Chroma vectrostore from `langchain`. This workflow will vectorize the documents using the OpenAI embeddings endpoint, and then load the documents and associated vectors into Chroma. Once the documents are in Chroma, we can perform a similarity search to retrieve documents related to our topic of interest."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "dfdf919b-f7b1-4ea3-af49-44d333c53c4d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.vectorstores.chroma import Chroma\n",
- "from langchain.embeddings import OpenAIEmbeddings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "4576e386-0fca-4d73-88dd-7df7d1afdef8",
- "metadata": {},
- "outputs": [],
- "source": [
- "embeddings = OpenAIEmbeddings()\n",
- "vectorstore = Chroma.from_documents(docs, embeddings)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "dc31bc23-21e8-4c96-9eb2-5a09734ca936",
- "metadata": {},
- "outputs": [],
- "source": [
- "query_docs = vectorstore.similarity_search(\n",
- " \"What is behind the rapid increase in car insurance rates?\", k=1\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d1c1498d-6b68-4c7f-9787-78c06f5992f1",
- "metadata": {},
- "source": [
- "## Summarize the Documents\n",
- "\n",
- "After retrieving relevant documents from Chroma, we're ready to summarize them! There are multiple ways to accomplish this in `langchain`, but `load_summarization_chain` is the easiest. Simply choose an LLM, load the summarization chain, and you're ready to summarize the documents. Here we limit the summary to snippets related to our topic of choice."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "85326ac4-ed5b-4ae9-aae0-854c8e0587b0",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.chat_models import ChatOpenAI\n",
- "from langchain.chains.summarize import load_summarize_chain"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "de09860c-0b95-4a91-abf3-92c683446c00",
- "metadata": {},
- "outputs": [],
- "source": [
- "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo-16k\")\n",
- "chain = load_summarize_chain(llm, chain_type=\"stuff\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "c3e4e37d-286a-4d58-813a-3c4a02f31ab0",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Car insurance rates in the US have increased by almost 21% in the past year, contributing to the overall inflation rate. The rise can be attributed to rising car repair costs, more severe and frequent car accidents, and riskier driving behaviors. The increase in rates varies by state, with Nevada experiencing the highest jump and North Carolina the smallest. While rates are expected to moderate nationally in the second half of 2024, some markets may continue to see increases.\n"
- ]
- }
- ],
- "source": [
- "print(chain.run(query_docs))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "283ed440-042c-4a84-8956-54a4912d9da6",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/chroma-news-of-the-day/requirements.txt b/examples/chroma-news-of-the-day/requirements.txt
deleted file mode 100644
index 45cf1453c2..0000000000
--- a/examples/chroma-news-of-the-day/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-chromadb
-langchain
-unstructured
diff --git a/examples/custom-layout-order/README.md b/examples/custom-layout-order/README.md
deleted file mode 100644
index 1e13af6ac6..0000000000
--- a/examples/custom-layout-order/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Custom Layout Sorting
-
-This directory contains examples of how element sorting works.
-
-## Running the example
-
-### Running script(.py)
-
-```
-export PYTHONPATH=.:$PYTHONPATH && python examples/custom-layout-order/evaluate_natural_reading_order.py " \
- --index-name "
" \
- --index-name "
" \
- --index-name "
" \
- --index-name "
\n",
- " \n",
- "
\n",
- "\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " type \n",
- " text \n",
- " element_id \n",
- " coordinates \n",
- " filename \n",
- " page_number \n",
- " url \n",
- " \n",
- " \n",
- " 0 \n",
- " NarrativeText \n",
- " This is a test email to use for unit tests. \n",
- " f49fbd614ddf5b72e06f59e554e6ae2b \n",
- " NaN \n",
- " ../../example-docs/fake-email.eml \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " 1 \n",
- " Title \n",
- " Important points: \n",
- " 9c218520320f238595f1fde74bdd137d \n",
- " NaN \n",
- " ../../example-docs/fake-email.eml \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " 2 \n",
- " ListItem \n",
- " Roses are red \n",
- " 8522061b991b1db70453502d328fe07e \n",
- " NaN \n",
- " ../../example-docs/fake-email.eml \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " 3 \n",
- " ListItem \n",
- " Violets are blue \n",
- " c3c4527761d4e4b8d0a4c4a0d46954c8 \n",
- " NaN \n",
- " ../../example-docs/fake-email.eml \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " \n",
- "4 \n",
- " Title \n",
- " Lorem ipsum dolor sit amet. \n",
- " dd14cbbf0e74909aac7f248a85d190af \n",
- " NaN \n",
- " ../../example-docs/fake.docx \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- "
\n",
- "\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " type \n",
- " text \n",
- " element_id \n",
- " coordinates \n",
- " filename \n",
- " page_number \n",
- " url \n",
- " \n",
- " \n",
- " \n",
- "0 \n",
- " NarrativeText \n",
- " This is a test email to use for unit tests. \n",
- " f49fbd614ddf5b72e06f59e554e6ae2b \n",
- " None \n",
- " ../../example-docs/fake-email.eml \n",
- " None \n",
- " None \n",
- " \n",
- " \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Step \n",
- " Training Loss \n",
- " \n",
- " \n",
- " \n",
- "500 \n",
- " 0.233400 \n",
- " \n",
- " \n",
- "
\n",
- "\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " type \n",
- " text \n",
- " \n",
- " \n",
- " 0 \n",
- " Title \n",
- " Skip to main content \n",
- " \n",
- " \n",
- " 1 \n",
- " NarrativeText \n",
- " (function(d){\\n var js, id = 'facebook-jssdk'... \n",
- " \n",
- " \n",
- " 2 \n",
- " Title \n",
- " Search form \n",
- " \n",
- " \n",
- " 3 \n",
- " ListItem \n",
- " Home \n",
- " \n",
- " \n",
- " \n",
- "4 \n",
- " ListItem \n",
- " Who We Are \n",
- " \n",
- " \n",
- "
\n",
- "\n",
- " \n",
- " \n",
- " filesize \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " filetype \n",
- " \n",
- " \n",
- " \n",
- " FileType.DOCX \n",
- " 36602.0 \n",
- " \n",
- " \n",
- " FileType.EML \n",
- " 149088.5 \n",
- " \n",
- " \n",
- " FileType.HTML \n",
- " 1228404.0 \n",
- " \n",
- " \n",
- " FileType.JPG \n",
- " 64002.5 \n",
- " \n",
- " \n",
- " FileType.PDF \n",
- " 2429245.0 \n",
- " \n",
- " \n",
- " FileType.PPTX \n",
- " 38412.0 \n",
- " \n",
- " \n",
- " FileType.TXT \n",
- " 619.0 \n",
- " \n",
- " \n",
- " FileType.UNK \n",
- " 1102.0 \n",
- " \n",
- " \n",
- " FileType.XLSX \n",
- " 4765.0 \n",
- " \n",
- " \n",
- " \n",
- "FileType.XML \n",
- " 713.5 \n",
- " \n"
- "\n"
- "
"
- )
- text_table = "Header Col 1 Header Col 2\n" "Lorem ipsum adipiscing"
- pre_chunk = TablePreChunk(
- Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
- overlap_prefix="ctus porta volutpat.",
- opts=ChunkingOptions(max_characters=175),
- )
+ def it_accumulates_elements_added_to_it(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
- chunk_iter = pre_chunk.iter_chunks()
+ builder.add_element(Title("Introduction"))
+ assert builder._text_length == 12
+ assert builder._remaining_space == 136
- chunk = next(chunk_iter)
- assert isinstance(chunk, Table)
- assert chunk.text == (
- "ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing"
- )
- assert chunk.metadata.text_as_html == (
- " \n"
- "\n"
- "\n"
- "Header Col 1 Header Col 2 \n"
- "\n"
- "Lorem ipsum adipiscing \n"
- "\n"
- "
"
+ builder.add_element(
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
)
- with pytest.raises(StopIteration):
- next(chunk_iter)
+ assert builder._text_length == 112
+ assert builder._remaining_space == 36
- def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
- table = Table("foo bar", metadata=ElementMetadata(text_as_html=" \n"
- "\n"
- "\n"
- "Header Col 1 Header Col 2 \n"
- "\n"
- "Lorem ipsum adipiscing foo bar
"))
- opts = ChunkingOptions(include_orig_elements=True)
- pre_chunk = TablePreChunk(table, "", opts)
+ @pytest.mark.parametrize("element", [Table("Heading\nCell text"), Text("abcd " * 200)])
+ def it_will_fit_a_Table_or_oversized_element_when_empty(self, element: Element):
+ builder = PreChunkBuilder(opts=ChunkingOptions())
+ assert builder.will_fit(element)
- chunk_iter = pre_chunk.iter_chunks()
+ @pytest.mark.parametrize(
+ ("existing_element", "next_element"),
+ [
+ (Text("abcd"), Table("Fruits\nMango")),
+ (Text("abcd"), Text("abcd " * 200)),
+ (Table("Heading\nCell text"), Table("Fruits\nMango")),
+ (Table("Heading\nCell text"), Text("abcd " * 200)),
+ ],
+ )
+ def but_not_when_it_already_contains_an_element_of_any_kind(
+ self, existing_element: Element, next_element: Element
+ ):
+ builder = PreChunkBuilder(opts=ChunkingOptions())
+ builder.add_element(existing_element)
- chunk = next(chunk_iter)
- assert isinstance(chunk, Table)
- assert chunk.metadata.orig_elements == [table]
- assert chunk.metadata.text_as_html == "foo bar
"
- # --
- with pytest.raises(StopIteration):
- next(chunk_iter)
+ assert not builder.will_fit(next_element)
- def but_not_when_instructed_not_to(self):
- pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
+ @pytest.mark.parametrize("element", [Text("abcd"), Table("Fruits\nMango")])
+ def it_will_not_fit_any_element_when_it_already_contains_a_table(self, element: Element):
+ builder = PreChunkBuilder(opts=ChunkingOptions())
+ builder.add_element(Table("Heading\nCell text"))
- chunk = next(pre_chunk.iter_chunks())
+ assert not builder.will_fit(element)
- assert isinstance(chunk, Table)
- assert chunk.metadata.orig_elements is None
+ def it_will_not_fit_an_element_when_it_already_exceeds_the_soft_maxlen(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100, new_after_n_chars=50))
+ builder.add_element(
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
+ )
- def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
- # fixed-overhead = 8+8+9+8+9+8 = 50
- # per-row overhead = 27
- html_table = (
- "\n" # 8
- "\n" # 8
- "
" # 8
+ assert not builder.will_fit(Text("In rhoncus ipsum."))
+
+ def and_it_will_not_fit_an_element_when_that_would_cause_it_to_exceed_the_hard_maxlen(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
+ builder.add_element(
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
- text_table = (
- "Header Col 1 Header Col 2\n"
- "Lorem ipsum dolor sit amet\n"
- "Consectetur adipiscing elit\n"
- "Nunc aliquam id enim nec molestie\n"
- "Vivamus quis nunc ipsum donec ac fermentum"
+
+ # -- 55 + 2 (separator) + 44 == 101 --
+ assert not builder.will_fit(
+ Text("In rhoncus ipsum sed lectus portos volutpat.") # 44-chars
)
- pre_chunk = TablePreChunk(
- Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
- overlap_prefix="",
- opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
+
+ def but_it_will_fit_an_element_that_fits(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=100))
+ builder.add_element(
+ Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") # 55-chars
)
- chunk_iter = pre_chunk.iter_chunks()
+ # -- 55 + 2 (separator) + 43 == 100 --
+ assert builder.will_fit(Text("In rhoncus ipsum sed lectus porto volutpat.")) # 43-chars
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
- assert chunk.text == (
- "Header Col 1 Header Col 2\n"
- "Lorem ipsum dolor sit amet\n"
- "Consectetur adipiscing elit"
- )
- assert chunk.metadata.text_as_html == (
- " \n"
- "\n" # 9
- "\n" # 8
- "Header Col 1 Header Col 2 \n"
- "Lorem ipsum A Link example \n"
- "Consectetur adipiscing elit \n"
- "Nunc aliquam id enim nec molestie \n"
- "\n" # 9
- "Vivamus quis nunc ipsum donec ac fermentum \n"
- "\n"
- "
"
+ " \n"
- "\n"
- "\n"
- "Header Col 1 Header Col 2 \n"
- "Lo"
+ def it_generates_a_TextPreChunk_when_flushed_and_resets_itself_to_empty(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+ builder.add_element(Title("Introduction"))
+ builder.add_element(
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
)
- assert not chunk.metadata.is_continuation
- # --
- chunk = next(chunk_iter)
- assert isinstance(chunk, TableChunk)
+
+ pre_chunk = next(builder.flush())
+
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._elements == [
+ Title("Introduction"),
+ Text(
+ "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed"
+ "lectus porta volutpat.",
+ ),
+ ]
+ assert builder._text_length == 0
+ assert builder._remaining_space == 150
+
+ def and_it_generates_a_TablePreChunk_when_it_contains_a_Table_element(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+ builder.add_element(Table("Heading\nCell text"))
+
+ pre_chunk = next(builder.flush())
+
+ # -- pre-chunk builder was reset before the yield, such that the iterator does not need to
+ # -- be exhausted before clearing out the old elements and a new pre-chunk can be
+ # -- accumulated immediately (first `next()` call is required however, to advance to the
+ # -- yield statement).
+ assert builder._text_length == 0
+ assert builder._remaining_space == 150
+ # -- pre-chunk is a `TablePreChunk` --
+ assert isinstance(pre_chunk, TablePreChunk)
+ assert pre_chunk._table == Table("Heading\nCell text")
+
+ def but_it_does_not_generate_a_pre_chunk_on_flush_when_empty(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+
+ pre_chunks = list(builder.flush())
+
+ assert pre_chunks == []
+ assert builder._text_length == 0
+ assert builder._remaining_space == 150
+
+ def it_computes_overlap_from_each_pre_chunk_and_applies_it_to_the_next(self):
+ opts = ChunkingOptions(overlap=15, overlap_all=True)
+ builder = PreChunkBuilder(opts=opts)
+
+ builder.add_element(Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."))
+ pre_chunk = list(builder.flush())[0]
+
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._text == "Lorem ipsum dolor sit amet consectetur adipiscing elit."
+
+ builder.add_element(Table("In rhoncus ipsum sed lectus porta volutpat."))
+ pre_chunk = list(builder.flush())[0]
+
+ assert isinstance(pre_chunk, TablePreChunk)
+ assert pre_chunk._text_with_overlap == (
+ "dipiscing elit.\nIn rhoncus ipsum sed lectus porta volutpat."
+ )
+
+ builder.add_element(Text("Donec semper facilisis metus finibus."))
+ pre_chunk = list(builder.flush())[0]
+
+ assert isinstance(pre_chunk, TextPreChunk)
+ assert pre_chunk._text == "porta volutpat.\n\nDonec semper facilisis metus finibus."
+
+ def it_considers_separator_length_when_computing_text_length_and_remaining_space(self):
+ builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
+ builder.add_element(Text("abcde"))
+ builder.add_element(Text("fghij"))
+
+ # -- ._text_length includes a separator ("\n\n", len==2) between each text-segment,
+ # -- so 5 + 2 + 5 = 12 here, not 5 + 5 = 10
+ assert builder._text_length == 12
+ # -- ._remaining_space is reduced by the length (2) of the trailing separator which would
+ # -- go between the current text and that of the next element if one was added.
+ # -- So 50 - 12 - 2 = 36 here, not 50 - 12 = 38
+ assert builder._remaining_space == 36
+
+
+# ================================================================================================
+# PRE-CHUNK SUBTYPES
+# ================================================================================================
+
+
+class DescribeTablePreChunk:
+ """Unit-test suite for `unstructured.chunking.base.TablePreChunk` objects."""
+
+ def it_uses_its_table_as_the_sole_chunk_when_it_fits_in_the_window(self):
+ html_table = (
+ " \n"
+ "\n"
+ "
"
+ )
+ text_table = "Header Col 1 Header Col 2\nLorem ipsum adipiscing"
+ pre_chunk = TablePreChunk(
+ Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+ overlap_prefix="ctus porta volutpat.",
+ opts=ChunkingOptions(max_characters=175),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, Table)
assert chunk.text == (
- "Nunc aliquam id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
+ "ctus porta volutpat.\nHeader Col 1 Header Col 2\nLorem ipsum adipiscing"
+ )
+ assert chunk.metadata.text_as_html == (
+ " \n"
+ "\n"
+ "\n"
+ "Header Col 1 Header Col 2 \n"
+ "\n"
+ "Lorem ipsum adipiscing "
+ "
"
+ )
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def but_not_when_the_table_is_is_empty_or_contains_only_whitespace(self):
+ html_table = " "
+ "Header Col 1 Header Col 2 "
+ "Lorem ipsum adipiscing
"
+ pre_chunk = TablePreChunk(
+ Table(" \t \n ", metadata=ElementMetadata(text_as_html=html_table)),
+ overlap_prefix="volutpat.",
+ opts=ChunkingOptions(max_characters=175),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self):
+ table = Table("foo bar", metadata=ElementMetadata(text_as_html=" \t \n foo bar
"))
+ opts = ChunkingOptions(include_orig_elements=True)
+ pre_chunk = TablePreChunk(table, "", opts)
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, Table)
+ assert chunk.metadata.orig_elements == [table]
+ assert chunk.metadata.text_as_html == "foo bar
"
+ # --
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
+ def but_not_when_instructed_not_to(self):
+ pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False))
+
+ chunk = next(pre_chunk.iter_chunks())
+
+ assert isinstance(chunk, Table)
+ assert chunk.metadata.orig_elements is None
+
+ def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self):
+ html_table = """\
+
+
+
+ """
+ text_table = (
+ "Header Col 1 Header Col 2\n"
+ "Lorem ipsum dolor sit amet\n"
+ "Consectetur adipiscing elit\n"
+ "Nunc aliquam id enim nec molestie\n"
+ "Vivamus quis nunc ipsum donec ac fermentum"
+ )
+ pre_chunk = TablePreChunk(
+ Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
+ overlap_prefix="",
+ opts=ChunkingOptions(max_characters=100, text_splitting_separators=("\n", " ")),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == "Header Col 1 Header Col 2"
+ assert chunk.metadata.text_as_html == (
+ "
+
+
+ Header Col 1 Header Col 2
+ Lorem ipsum A Link example
+ Consectetur adipiscing elit
+
+ Nunc aliquam id enim nec molestie
"
)
+ assert chunk.metadata.is_continuation is None
+ # --
+ chunk = next(chunk_iter)
+ assert isinstance(chunk, TableChunk)
+ assert chunk.text == "Lorem ipsum A Link example"
assert chunk.metadata.text_as_html == (
- "rem ipsum Header Col 1 Header Col 2 A Link example Consectetur adipiscing elit <"
+ "
"
)
assert chunk.metadata.is_continuation
- # -- note that text runs out but HTML continues because it's significantly longer. So two
- # -- of these chunks have HTML but no text.
+ # --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
- assert chunk.text == ""
+ assert chunk.text == "Consectetur adipiscing elit"
assert chunk.metadata.text_as_html == (
- "/tr>\n"
- "Lorem ipsum A Link example \n"
- "Nunc aliquam id enim nec molestie \n\nVivamus quis "
+ "
"
)
assert chunk.metadata.is_continuation
# --
chunk = next(chunk_iter)
assert isinstance(chunk, TableChunk)
- assert chunk.text == ""
+ assert chunk.text == "Nunc aliquam id enim nec molestie"
assert chunk.metadata.text_as_html == (
- "nunc ipsum donec ac fermentumConsectetur adipiscing elit
"
)
assert chunk.metadata.is_continuation
# --
@@ -483,8 +545,8 @@ def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed
[
# -- normally it splits exactly on overlap size |------- 20 -------|
("In rhoncus ipsum sed lectus porta volutpat.", "ctus porta volutpat."),
- # -- but it strips leading and trailing whitespace when the tail includes it --
- ("In rhoncus ipsum sed lectus porta volutpat. ", "porta volutpat."),
+ # -- but it strips leading whitespace when the tail includes it --
+ ("In rhoncus ipsum sed lectus porta volutpat.", "porta volutpat."),
],
)
def it_computes_its_overlap_tail_for_use_in_inter_pre_chunk_overlap(
@@ -516,7 +578,7 @@ def it_includes_its_overlap_prefix_in_its_text_when_present(
pre_chunk = TablePreChunk(
Table(text), overlap_prefix=overlap_prefix, opts=ChunkingOptions()
)
- assert pre_chunk._text == expected_value
+ assert pre_chunk._text_with_overlap == expected_value
def it_computes_metadata_for_each_chunk_to_help(self):
table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="Nunc aliquam id enim nec molestie "))
@@ -597,6 +659,10 @@ def it_knows_when_it_is_equal_to_another_TextPreChunk_instance(
assert (pre_chunk == other_pre_chunk) is expected_value
+ def and_it_knows_it_is_not_equal_to_an_object_that_is_not_a_TextPreChunk(self):
+ pre_chunk = TextPreChunk([], overlap_prefix="", opts=ChunkingOptions())
+ assert pre_chunk != 42
+
@pytest.mark.parametrize(
("max_characters", "combine_text_under_n_chars", "expected_value"),
[
@@ -771,6 +837,19 @@ def and_it_adds_the_is_continuation_flag_for_second_and_later_text_split_chunks(
assert [c.metadata.is_continuation for c in chunk_iter] == [None, True, True]
+ def but_it_generates_no_chunks_when_the_pre_chunk_contains_no_text(self):
+ metadata = ElementMetadata()
+ pre_chunk = TextPreChunk(
+ [PageBreak("", metadata=metadata)],
+ overlap_prefix="",
+ opts=ChunkingOptions(),
+ )
+
+ chunk_iter = pre_chunk.iter_chunks()
+
+ with pytest.raises(StopIteration):
+ next(chunk_iter)
+
@pytest.mark.parametrize(
("text", "expected_value"),
[
@@ -878,51 +957,6 @@ def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self):
assert orig_elements[0] is element
assert orig_elements[1] is element_2
- def it_consolidates_regex_metadata_in_a_field_specific_way(self):
- """regex_metadata of chunk is combined regex_metadatas of its elements.
-
- Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
- position in the chunk after element text has been concatenated.
- """
- pre_chunk = TextPreChunk(
- [
- Title(
- "Lorem Ipsum",
- metadata=ElementMetadata(
- regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
- ),
- ),
- Text(
- "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
- metadata=ElementMetadata(
- regex_metadata={
- "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
- "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
- },
- ),
- ),
- Text(
- "In rhoncus ipsum sed lectus porta volutpat.",
- metadata=ElementMetadata(
- regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
- ),
- ),
- ],
- overlap_prefix="ficitur.", # len == 8
- opts=ChunkingOptions(),
- )
-
- regex_metadata = pre_chunk._consolidated_regex_meta
-
- assert regex_metadata == {
- "dolor": [RegexMetadata(text="dolor", start=35, end=40)],
- "ipsum": [
- RegexMetadata(text="Ipsum", start=16, end=21),
- RegexMetadata(text="ipsum", start=29, end=34),
- RegexMetadata(text="ipsum", start=91, end=96),
- ],
- }
-
def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
"""._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
@@ -941,7 +975,6 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate
emphasized_text_contents=["Lorem", "Ipsum"],
emphasized_text_tags=["b", "i"],
languages=["lat"],
- regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
),
),
Text(
@@ -956,11 +989,6 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate
emphasized_text_tags=["i", "b"],
# -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
languages=["eng", "lat"],
- # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
- regex_metadata={
- "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
- "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
- },
),
),
],
@@ -975,13 +1003,6 @@ def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strate
"emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
"emphasized_text_tags": ["b", "i", "i", "b"],
"languages": ["lat", "eng"],
- "regex_metadata": {
- "ipsum": [
- RegexMetadata(text="Ipsum", start=6, end=11),
- RegexMetadata(text="ipsum", start=19, end=24),
- ],
- "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
- },
}
def it_computes_the_original_elements_list_to_help(self):
@@ -1032,171 +1053,462 @@ def it_knows_the_concatenated_text_of_the_pre_chunk_to_help(
# ================================================================================================
-# PRE-CHUNKING ACCUMULATORS
+# PRE-CHUNK SPLITTERS
# ================================================================================================
-class DescribePreChunkBuilder:
- """Unit-test suite for `unstructured.chunking.base.PreChunkBuilder`."""
+class Describe_TableSplitter:
+ """Unit-test suite for `unstructured.chunking.base._TableSplitter`."""
+
+ def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self):
+ opts = ChunkingOptions(max_characters=(150))
+ html_table = HtmlTable.from_html_text(
+ """
+
+
+
+ """
+ )
- def it_is_empty_on_construction(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=50))
+ assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
+ (
+ "Stanley Cups Team Location Stanley Cups",
+ "
+
+ Stanley
+ Cups
+
+
+
+
+ Team
+ Location
+ Stanley Cups
+
+
+ Blues
+ STL
+ 1
+
+
+ Flyers
+ PHI
+ 2
+
+
+
+ Maple Leafs
+ TOR
+ 13
+ "
+ "
",
+ ),
+ (
+ "Blues STL 1 Flyers PHI 2",
+ " "
+ "Stanley Cups "
+ "Team Location Stanley Cups "
+ "
",
+ ),
+ (
+ "Maple Leafs TOR 13",
+ " "
+ "Blues STL 1 "
+ "Flyers PHI 2 " "
",
+ ),
+ ]
- assert builder._text_length == 0
- assert builder._remaining_space == 50
+ def and_it_splits_an_oversized_row_on_an_even_cell_boundary_when_possible(self):
+ opts = ChunkingOptions(max_characters=(100))
+ html_table = HtmlTable.from_html_text(
+ """
+ " "Maple Leafs TOR 13
+
+ """
+ )
- def it_accumulates_elements_added_to_it(self):
- builder = PreChunkBuilder(opts=ChunkingOptions(max_characters=150))
+ assert list(_TableSplitter.iter_subtables(html_table, opts)) == [
+ (
+ "Lorem ipsum dolor sit amet. Consectetur adipiscing elit.",
+ "
+
+ Lorem ipsum dolor sit amet.
+ Consectetur adipiscing elit.
+
+ Laboris nisi ut
+ aliquip ex ea commodo.
+
+
+
+ Duis
+ Dolor
+
+
+ Duis
+ Cillum
+
",
+ ),
+ (
+ "Laboris nisi ut aliquip ex ea commodo.",
+ ""
+ " Lorem ipsum dolor sit amet. "
+ "Consectetur adipiscing elit. "
+ "
",
+ ),
+ (
+ "Duis Dolor Duis Cillum",
+ "Laboris nisi ut aliquip ex ea commodo. "
+ "
",
+ ),
+ ]
- builder.add_element(Title("Introduction"))
- assert builder._text_length == 12
- assert builder._remaining_space == 136
+ def and_it_splits_an_oversized_cell_on_an_even_word_boundary(self):
+ opts = ChunkingOptions(max_characters=(100))
+ html_table = HtmlTable.from_html_text(
+ """
+ "
+ "Duis Dolor "
+ "Duis Cillum
+
+
"
+ "
+
+
+
+
+ Lorem ipsum dolor sit amet,
+ consectetur adipiscing elit.
+ Sed do eiusmod tempor
+ incididunt ut labore et dolore magna aliqua.
+
+ Ut enim ad minim veniam.
+ Quis nostrud exercitation ullamco.
+
+ Duis aute irure dolor
+
+ In reprehenderit voluptate. "
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do
eiusmod tempor incididunt ut labore et dolore magna aliqua. |
Ut enim ad minim veniam. | " + "Quis nostrud exercitation ullamco. | " + "
Duis aute irure dolor |
In reprehenderit voluptate. |
... |
abcde fghij klmno |
abcde fghij klmno | pqrst uvwxy z |
abcde fghij klmno |
abcde fghij klmno |
pqrst uvwxy z |
First
+Second
+First
' + metadata_3 = 'Second
' + + elements = [ + Text(text="", metadata=ElementMetadata(text_as_html=metadata_1)), + NarrativeText( + text="First", metadata=ElementMetadata(text_as_html=metadata_2, parent_id="1") + ), + NarrativeText( + text="Second", metadata=ElementMetadata(text_as_html=metadata_3, parent_id="1") + ), + ] + chunks = chunking_fn(elements, max_characters=6) + assert len(chunks) == 2 + assert chunks[0].text == "First" + assert chunks[1].text == "Second" + + assert chunks[0].metadata.text_as_html == metadata_1 + " " + metadata_2 + assert chunks[1].metadata.text_as_html == metadata_3 + + +def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn): + """Mimic behaviour of elements with non-html metadata""" + metadata_1 = 'cell1 | cell3 | |
cell5 |
<>&" | newline |
cell1 | cell2 |
cell3 | cell4 |
foobar |
foobar |
foobar |
foobar |
foobar |
foobar |
foobar |
foobar |
foobar |
foobar |
foobar |
a | b | |
---|---|---|
c |
a | b | |
c |
\tabc def\nghi | \n
abc def ghi |
foobar |
foobar |
abc | def | ghi |
jkl | mno | pqr |
stu | vwx | yz |
a\n b c | def | |
---|---|---|
gh \ti | \n jk l | |
m n op\n |
+ Description + | ++ Row header + | +
---|---|
+ Value description + | ++ + 50 $ + + + (1.32 %) + + | +
+ Paragraph text +
++ The following table summarizes our intangible assets by business segment at the dates indicated: +
++ | ++ June 30, 2023 + | ++ December 31, 2022 + | +||||
---|---|---|---|---|---|---|
+ | ++ Gross Value + | ++ Accumulated Amortization + | ++ Carrying Value + | ++ Gross Value + | ++ Accumulated Amortization + | ++ Carrying Value + | +
+ NGL Pipelines & Services: + | ++ | ++ | ++ | ++ | ++ | ++ | +
+ Customer relationship intangibles + | +
+
+ $
+
+ + 449 + + |
+
+
+ $
+
+ + (257) + + |
+
+
+ $
+
+ + 192 + + |
+
+
+ $
+
+ + 449 + + |
+
+
+ $
+
+ + (249) + + |
+
+
+ $
+
+ + 200 + + |
+
+ Contract-based intangibles + | ++ 751 + | ++ (95) + | ++ 656 + | ++ 749 + | ++ (84) + | ++ 665 + | +
+ Segment total + | ++ 1,200 + | ++ (352) + | ++ 848 + | ++ 1,198 + | ++ (333) + | ++ 865 + | +
+ Crude Oil Pipelines & Services: + | ++ | ++ | ++ | ++ | ++ | ++ | +
+ Customer relationship intangibles + | ++ 2,195 + | ++ (477) + | ++ 1,718 + | ++ 2,195 + | ++ (431) + | ++ 1,764 + | +
+ Contract-based intangibles + | ++ 283 + | ++ (273) + | ++ 10 + | ++ 283 + | ++ (271) + | ++ 12 + | +
+ Segment total + | ++ 2,478 + | ++ (750) + | ++ 1,728 + | ++ 2,478 + | ++ (702) + | ++ 1,776 + | +
+ Natural Gas Pipelines & Services: + | ++ | ++ | ++ | ++ | ++ | ++ | +
+ Customer relationship intangibles + | ++ 1,350 + | ++ (607) + | ++ 743 + | ++ 1,350 + | ++ (588) + | ++ 762 + | +
+ Contract-based intangibles + | ++ 639 + | ++ (201) + | ++ 438 + | ++ 639 + | ++ (195) + | ++ 444 + | +
+ Segment total + | ++ 1,989 + | ++ (808) + | ++ 1,181 + | ++ 1,989 + | ++ (783) + | ++ 1,206 + | +
+ Petrochemical & Refined Products Services: + | ++ | ++ | ++ | ++ | ++ | ++ | +
+ Customer relationship intangibles + | ++ 181 + | ++ (83) + | ++ 98 + | ++ 181 + | ++ (80) + | ++ 101 + | +
+ Contract-based intangibles + | ++ 45 + | ++ (29) + | ++ 16 + | ++ 45 + | ++ (28) + | ++ 17 + | +
+ Segment total + | ++ 226 + | ++ (112) + | ++ 114 + | ++ 226 + | ++ (108) + | ++ 118 + | +
+ Total intangible assets + | +
+
+ $
+
+ + 5,893 + + |
+
+
+ $
+
+ + (2,022) + + |
+
+
+ $
+
+ + 3,871 + + |
+
+
+ $
+
+ + 5,891 + + |
+
+
+ $
+
+ + (1,926) + + |
+
+
+ $
+
+ + 3,965 + + |
+
+ The following table presents the amortization expense of our intangible assets by business segment for the periods indicated: +
++ | ++ For the Three Months Ended June 30, + | ++ For the Six Months Ended June 30, + | +||
---|---|---|---|---|
+ | ++ 2023 + | ++ 2022 + | ++ 2023 + | ++ 2022 + | +
+ NGL Pipelines & Services + | +
+
+ $
+
+ + 10 + + |
+
+
+ $
+
+ + 9 + + |
+
+
+ $
+
+ + 19 + + |
+
+
+ $
+
+ + 17 + + |
+
+ Crude Oil Pipelines & Services + | ++ 25 + | ++ 21 + | ++ 48 + | ++ 41 + | +
+ Natural Gas Pipelines & Services + | ++ 13 + | ++ 14 + | ++ 25 + | ++ 25 + | +
+ Petrochemical & Refined Products Services + | ++ 2 + | ++ 1 + | ++ 4 + | ++ 3 + | +
+ Total + | +
+
+ $
+
+ + 50 + + |
+
+
+ $
+
+ + 45 + + |
+
+
+ $
+
+ + 96 + + |
+
+
+ $
+
+ + 86 + + |
+
+ The following table presents our forecast of amortization expense associated with existing intangible assets for the periods indicated: +
++ | ++ Remainder of 2023 + | ++ 2024 + | ++ 2025 + | ++ 2026 + | ++ 2027 + | +
---|---|---|---|---|---|
+ + $ + + | ++ 107 + | ++ 222 + | ++ 230 + | ++ 237 + | ++ 235 + | +
+ Goodwill represents the excess of the purchase price of an acquired business over the amounts assigned to assets acquired and liabilities assumed in the transaction. There has been no change in our goodwill amounts since those reported in our 2022 Form 10-K. +
+ + 13 + + ++ The following table presents our consolidated debt obligations (arranged by company and maturity date) at the dates indicated: +
++ | ++ June 30, 2023 + | ++ December 31, 2022 + | +
---|---|---|
+ EPO senior debt obligations: + | ++ | ++ | +
+ Commercial Paper Notes, variable-rates + | +
+
+ $
+
+ + 355 + + |
+
+
+ $
+
+ + 495 + + |
+
+ Senior Notes HH, 3.35% fixed-rate, due March 2023 + | ++ - + | ++ 1,250 + | +
+ Senior Notes JJ, 3.90% fixed-rate, due February 2024 + | ++ 850 + | ++ 850 + | +
+ + March 2023 $1.5 Billion 364-Day Revolving Credit Agreement, variable-rate, due March 2024 + + + (1) + + |
+ + - + | ++ - + | +
+ Senior Notes MM, 3.75% fixed-rate, due February 2025 + | ++ 1,150 + | ++ 1,150 + | +
+ Senior Notes FFF, 5.05% fixed-rate, due January 2026 + | ++ 750 + | ++ - + | +
+ Senior Notes PP, 3.70% fixed-rate, due February 2026 + | ++ 875 + | ++ 875 + | +
+ Senior Notes SS, 3.95% fixed-rate, due February 2027 + | ++ 575 + | ++ 575 + | +
+ + March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement, variable-rate, due March 2028 + + + (2) + + |
+ + - + | ++ - + | +
+ Senior Notes WW, 4.15% fixed-rate, due October 2028 + | ++ 1,000 + | ++ 1,000 + | +
+ Senior Notes YY, 3.125% fixed-rate, due July 2029 + | ++ 1,250 + | ++ 1,250 + | +
+ Senior Notes AAA, 2.80% fixed-rate, due January 2030 + | ++ 1,250 + | ++ 1,250 + | +
+ Senior Notes GGG, 5.35% fixed-rate, due January 2033 + | ++ 1,000 + | ++ - + | +
+ Senior Notes D, 6.875% fixed-rate, due March 2033 + | ++ 500 + | ++ 500 + | +
+ Senior Notes H, 6.65% fixed-rate, due October 2034 + | ++ 350 + | ++ 350 + | +
+ Senior Notes J, 5.75% fixed-rate, due March 2035 + | ++ 250 + | ++ 250 + | +
+ Senior Notes W, 7.95% fixed-rate, due April 2038 + | ++ 400 + | ++ 400 + | +
+ Senior Notes R, 6.125% fixed-rate, due October 2039 + | ++ 600 + | ++ 600 + | +
+ Senior Notes Z, 6.45% fixed-rate, due September 2040 + | ++ 600 + | ++ 600 + | +
+ Senior Notes BB, 5.95% fixed-rate, due February 2041 + | ++ 750 + | ++ 750 + | +
+ Senior Notes DD, 5.70% fixed-rate, due February 2042 + | ++ 600 + | ++ 600 + | +
+ Senior Notes EE, 4.85% fixed-rate, due August 2042 + | ++ 750 + | ++ 750 + | +
+ Senior Notes GG, 4.45% fixed-rate, due February 2043 + | ++ 1,100 + | ++ 1,100 + | +
+ Senior Notes II, 4.85% fixed-rate, due March 2044 + | ++ 1,400 + | ++ 1,400 + | +
+ Senior Notes KK, 5.10% fixed-rate, due February 2045 + | ++ 1,150 + | ++ 1,150 + | +
+ Senior Notes QQ, 4.09% fixed-rate, due May 2046 + | ++ 975 + | ++ 975 + | +
+ Senior Notes UU, 4.25% fixed-rate, due February 2048 + | ++ 1,250 + | ++ 1,250 + | +
+ Senior Notes XX, 4.80% fixed-rate, due February 2049 + | ++ 1,250 + | ++ 1,250 + | +
+ Senior Notes ZZ, 4.20% fixed-rate, due January 2051 + | ++ 1,250 + | ++ 1,250 + | +
+ Senior Notes BB, 3.70% fixed-rate, due January 2051 + | ++ 1,000 + | ++ 1,000 + | +
+ Senior Notes DDD, 3.30% fixed-rate, due February 2052 + | ++ 1,000 + | ++ 1,000 + | +
+ Senior Notes EEE, 3.00% fixed-rate, due February 2053 + | ++ 1,000 + | ++ 1,000 + | +
+ Senior Notes NN, 4.95% fixed-rate, due October 2054 + | ++ 400 + | ++ 400 + | +
+ Senior Notes CCC, 3.95% fixed-rate, due January 2060 + | ++ 1,000 + | ++ 1,000 + | +
+ Total principal amount of senior debt obligations + | ++ 26,630 + | ++ 26,270 + | +
+ + EPO Junior Subordinated Notes C, variable-rate, due June 2067 + + + (3)(7) + + |
+ + 232 + | ++ 232 + | +
+ + EPO Junior Subordinated Notes D, variable-rate, due August 2077 + + + (4)(7) + + |
+ + 350 + | ++ 350 + | +
+ + EPO Junior Subordinated Notes E, fixed/variable-rate, due August 2077 + + + (5)(7) + + |
+ + 1,000 + | ++ 1,000 + | +
+ + EPO Junior Subordinated Notes F, fixed/variable-rate, due February 2078 + + + (6)(7) + + |
+ + 700 + | ++ 700 + | +
+ + TEPPCO Junior Subordinated Notes, variable-rate, due June 2067 + + + (3)(7) + + |
+ + 14 + | ++ 14 + | +
+ Total principal amount of senior and junior debt obligations + | ++ 28,926 + | ++ 28,566 + | +
+ Other, non-principal amounts + | ++ | ++ | +
+ Less current maturities of debt + | ++ (279) + | ++ (271) + | +
+ Total long-term debt + | ++ (1,204) + | ++ (1,744) + | +
+ | +
+
+ $
+
+ + 27,443 + + |
+
+
+ $
+
+ + 26,551 + + |
+
+ See discussion below in “Variable Interest Rates” regarding the LIBOR replacement and LIBOR replacement rate. +
++ References to “TEPPCO” mean TEPPCO Partners, L.P. prior to its merger with one of our wholly owned subsidiaries in October 2009. +
+ + 14 + + ++ The following table presents the range of interest rates and weighted-average interest rates paid on our consolidated variable-rate debt during the six months ended June 30, 2023: +
++ Range of Interest Rates Paid + | ++ Weighted-Average Interest Rate Paid + | +|
---|---|---|
+ Commercial Paper Notes + | ++ 4.59% to 5.34% + | ++ 5.17% + | +
+ EPO Junior Subordinated Notes C and TEPPCO Junior Subordinated Notes + | ++ 7.54% to 8.27% + | ++ 7.76% + | +
+ EPO Junior Subordinated Notes D + | ++ 7.63% to 8.30% + | ++ 7.91% + | +
+ Amounts borrowed under EPO’s March 2023 $1.5 Billion 364-Day Revolving Credit Agreement and March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement bear interest, at EPO’s election, equal to: (i) the Secured Overnight Financing Rate (“SOFR”), plus an additional variable spread; or (ii) an alternate base rate, which is the greatest of (a) the Prime Rate in effect on such day, (b) the Federal Funds Effective Rate in effect on such day plus 0.5%, or (c) Adjusted Term SOFR, for an interest period of one month in effect on such day plus 1%, and a variable spread. The applicable spreads are determined based on EPO’s debt ratings. +
++ In July 2017, the Financial Conduct Authority in the U.K. announced a desire to phase out LIBOR as a benchmark by the end of June 2023. In December 2022, the Board of Governors of the Federal Reserve System approved a final rule to implement the Adjustable Interest Rate (LIBOR) Act, which established benchmark replacements for certain contracts that reference various tenors of LIBOR and do not provide an alternative rate or would result in a rate that is expressed in terms of the last known value of LIBOR (typically referred to as a “frozen LIBOR” provision). The final rule became effective during the first quarter of 2023. As a result of the LIBOR Act, our Junior Subordinated Notes C and D and the TEPPCO Junior Subordinated Notes, which were subject to a variable rate (as defined by the applicable agreement) based on three-month LIBOR (in each case, a “LIBOR Rate”) through June 30, 2023, replaced the applicable LIBOR Rate with a variable rate based on the three-month CME Term SOFR (“SOFR Rate”) as administered by the CME Group Benchmark Administration, Ltd. plus a 0.26161% tenor spread adjustment beginning on July 1, 2023. Additionally, our Junior Subordinated Notes D and F, which would have been subject to a variable rate (as defined by the applicable agreement) based on three-month LIBOR beginning in August 2027 and February 2028, respectively, will replace the applicable LIBOR Rate with the three-month SOFR rate plus a 0.26161% tenor spread adjustment. The foregoing tenor spread adjustment will be in addition to the applicable spread under the terms of each series of Junior Subordinated Notes. We do not expect the transition from LIBOR to have a material financial impact on us. +
++ The following table presents the scheduled maturities of principal amounts of EPO’s consolidated debt obligations at June 30, 2023 for the next five years, and in total thereafter: +
++ Scheduled Maturities of Debt + | +|||||||
---|---|---|---|---|---|---|---|
+ Total + | ++ Remainder of 2023 + | ++ 2024 + | ++ 2025 + | ++ 2026 + | ++ 2027 + | ++ Thereafter + | +|
+ Commercial Paper Notes + | ++ $ 355 + | ++ $ 355 + | ++ $ — + | ++ $ — + | ++ $ — + | ++ $ — + | ++ $ — + | +
+ Senior Notes + | ++ $ 26,275 + | ++ $ — + | ++ $ 850 + | ++ $ 1,150 + | ++ $ 1,625 + | ++ $ 575 + | ++ $ 22,075 + | +
+ Junior Subordinated Notes + | ++ $ 2,296 + | ++ $ — + | ++ $ — + | ++ $ — + | ++ $ — + | ++ $ — + | ++ $ 2,296 + | +
+ Total + | ++ $ 28,926 + | ++ $ 355 + | ++ $ 850 + | ++ $ 1,150 + | ++ $ 1,625 + | ++ $ 575 + | ++ $ 24,371 + | +
+ In March 2023, EPO entered into a new 364-Day Revolving Credit Agreement (the “March 2023 $1.5 Billion 364-Day Revolving Credit Agreement”) that replaced its September 2022 364-Day Revolving Credit Agreement. There were no principal amounts outstanding under the September 2022 364-Day Revolving Credit Agreement when it was replaced by the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement. As of June 30, 2023, there were no principal amounts outstanding under the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement. +
++ Under the terms of the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement, EPO may borrow up to $1.5 billion (which may be increased by up to $200 million to $1.7 billion at EPO’s election, provided certain conditions are met) at a variable interest rate for a term of up to 364 days, subject to the terms and conditions set forth therein. The March 2023 $1.5 Billion 364-Day Revolving Credit Agreement matures in March 2024. To the extent that principal amounts are outstanding at the maturity date, EPO may elect to have the entire principal balance then outstanding continued as non-revolving term loans for a period of one additional year, payable in March 2025. Borrowings under the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement may be used for working capital, capital expenditures, acquisitions and general company purposes. +
++ The March 2023 $1.5 Billion 364-Day Revolving Credit Agreement contains customary representations, warranties, covenants (affirmative and negative) and events of default, the occurrence of which would permit the lenders to accelerate the maturity date of any amounts borrowed under this credit agreement. The March 2023 $1.5 Billion 364-Day Revolving Credit Agreement also restricts EPO’s ability to pay cash distributions to the Partnership, if an event of default (as defined in the credit agreement) has occurred and is continuing at the time such distribution is scheduled to be paid or would result therefrom. +
++ EPO’s obligations under the March 2023 $1.5 Billion 364-Day Revolving Credit Agreement are not secured by any collateral; however, they are guaranteed by the Partnership. +
++ In March 2023, EPO entered into a new revolving credit agreement that matures in March 2028 (the “March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement”). The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement replaced EPO’s prior multi-year revolving credit agreement that was scheduled to mature in September 2026. There were no principal amounts outstanding under the prior multi-year revolving credit agreement when it was replaced by the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement. As of June 30, 2023, there were no principal amounts outstanding under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement. +
++ Under the terms of the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement, EPO may borrow up to $2.7 billion (which may be increased by up to $500 million to $3.2 billion at EPO’s election, provided certain conditions are met) at a variable interest rate for a term of five years, subject to the terms and conditions set forth therein. The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement matures in March 2028, although the maturity date may be extended at EPO’s request (up to two requests) for a one-year extension of the maturity date by delivering a request prior to the maturity date and with the consent of required lenders as set forth under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement. Borrowings under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement may be used for working capital, capital expenditures, acquisitions and general company purposes. +
++ The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement contains customary representations, warranties, covenants (affirmative and negative) and events of default, the occurrence of which would permit the lenders to accelerate the maturity date of any amounts borrowed under this credit agreement. The March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement also restricts EPO’s ability to pay cash distributions to the Partnership, if an event of default (as defined in the credit agreement) has occurred and is continuing at the time such distribution is scheduled to be paid or would result therefrom. +
++ EPO’s obligations under the March 2023 $2.7 Billion Multi-Year Revolving Credit Agreement are not secured by any collateral; however, they are guaranteed by the Partnership. +
++ In January 2023, EPO issued $1.75 billion aggregate principal amount of senior notes comprised of (i) $750 million principal amount of senior notes due January 2026 (“Senior Notes FFF”) and (ii) $1.0 billion principal amount of senior notes due January 2033 (“Senior Notes GGG”). Net proceeds from this offering were used by EPO for general company purposes, including for growth capital investments, and the repayment of debt (including the repayment of all of our $1.25 billion principal amount of 3.35% Senior Notes HH at their maturity in March 2023 and amounts outstanding under our commercial paper program). +
++ Senior Notes FFF were issued at 99.893% of their principal amount and have a fixed-rate interest rate of 5.05% per year. Senior Notes GGG were issued at 99.803% of their principal amount and have a fixed-rate interest rate of 5.35% per year. The Partnership guaranteed these senior notes through an unconditional guarantee on an unsecured and unsubordinated basis. +
++ At June 30, 2023, EPO had $110 million of letters of credit outstanding primarily related to our commodity hedging activities. +
++ We were in compliance with the financial covenants of our consolidated debt agreements at June 30, 2023. +
++ The Partnership acts as guarantor of the consolidated debt obligations of EPO, with the exception of the remaining debt obligations of TEPPCO. If EPO were to default on any of its guaranteed debt, the Partnership would be responsible for full and unconditional repayment of such obligations. +
++ The following table summarizes changes in the number of our common units outstanding since December 31, 2022: +
++ Common units outstanding at December 31, 2022 + | ++ | +
---|---|
+ Common unit repurchases under 2019 Buyback Program + | ++ (682,589) + | +
+ Common units issued in connection with the vesting of phantom unit awards, net + | ++ 4,364,301 + | +
+ Other + | ++ 20,892 + | +
+ Common units outstanding at March 31, 2023 + | ++ 2,174,508,951 + | +
+ Common unit repurchases under 2019 Buyback Program + | ++ (2,910,121) + | +
+ Common units issued in connection with the vesting of phantom unit awards, net + | ++ 153,502 + | +
+ Common units outstanding at June 30, 2023 + | ++ 2,171,752,332 + | +
+ We have a universal shelf registration statement on file with the SEC which allows the Partnership and EPO (each on a standalone basis) to issue an unlimited amount of equity and debt securities, respectively. +
++ In addition, the Partnership has a registration statement on file with the SEC covering the issuance of up to $2.5 billion of its common units in amounts, at prices and on terms based on market conditions and other factors at the time of such offerings (referred to as the Partnership’s at-the-market (“ATM”) program). The Partnership did not issue any common units under its ATM program during the six months ended June 30, 2023. The Partnership’s capacity to issue additional common units under the ATM program remains at $2.5 billion as of June 30, 2023. +
++ We may issue additional equity and debt securities to assist us in meeting our future liquidity requirements, including those related to capital investments. +
+ ++ Table of Contents +
+ + 68 Prince Street Palmdale, CA 93550 + + + www.google.com + + + More text + ++ Header 1 + | ++ Header 2 + | +
---|---|
+ Row 1, Cell 1 + | ++ Row 1, Cell 2 + | +
+ Row 2, Cell 1 + | ++ Row 2, Cell 2 + | +
+ Big Table Header + | +||
---|---|---|
+ Merged Cell 1 + | ++ Cell 2 + | ++ Cell 3 + | +
+ Merged Cell 4 and 5 + | +||
+ Cell 6 + | ++ Cell 7 + | ++ Cell 8 + | +
+ Cell 9 + | ++ A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. + | +|
+ Cell 10 + | ++ Cell 11 + | ++ Cell 12 + | +
+ Chapter + | ++ Title + | ++ Page + | +
---|---|---|
+ 1 + | ++ Introduction + | ++ 1 + | +
+ 2 + | ++ Getting Started + | ++ 5 + | +
+ 3 + | ++ Basic Concepts + | ++ 12 + | +
+ 4 + | ++ Advanced Topics + | ++ 25 + | +
+ 5 + | ++ Conclusion + | ++ 40 + | +