[import-external-reference] Fully change the library to generate PDF …

…/ MD, accept cookies (#1866)
OpenCTI-Platform · Sep 8, 2024 · 7808beb · 7808beb
1 parent a5650fa
commit 7808beb
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 52 deletions.
diff --git a/internal-enrichment/import-external-reference/Dockerfile b/internal-enrichment/import-external-reference/Dockerfile
@@ -7,13 +7,10 @@ COPY src /opt/opencti-connector-import-external-reference
 # Install Python modules
 # hadolint ignore=DL3003
 RUN apt-get update && \
-    apt-get install -y git build-essential libmagic-dev libffi-dev libxml2-dev libxslt-dev libssl-dev cargo libjpeg-dev zlib1g-dev && \
-    ARCH=`echo -n $(uname -m) | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/'` && \
-        wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6.1-2/wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \
-        apt-get install -y ./wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \
-        rm wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \
+    apt-get install -y git build-essential libmagic-dev libffi-dev libxml2-dev libxslt-dev libssl-dev cargo libjpeg-dev zlib1g-dev libxkbcommon0 libgbm1  && \
     cd /opt/opencti-connector-import-external-reference && \
-    pip3 install --no-cache-dir -r requirements.txt
+    pip3 install --no-cache-dir -r requirements.txt && \
+    playwright install
 
 # Expose and entrypoint
 COPY entrypoint.sh /

diff --git a/internal-enrichment/import-external-reference/README.md b/internal-enrichment/import-external-reference/README.md
@@ -27,8 +27,8 @@ OpenCTI data is coming from *import* connectors.
 | `connector_log_level`                        | `CONNECTOR_LOG_LEVEL`                        | Yes       | Connector logging verbosity, could be `debug`, `info`, `warn` or `error` (less verbose). |
 | `import_external_reference_import_as_pdf`    | `IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_PDF`    | Yes       | Import as PDF file                                                                       |
 | `import_external_reference_import_as_md`     | `IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_MD`     | Yes       | Import as MD file                                                                        |
-| `import_external_reference_import_pdf_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD` | Yes       | If import_as_md is true, try to convert PDF as Markdown                                  | 
-| `wkhtmltopdf_path`                           | `WKHTMLTOPDF_PATH`                           | No        | If the environment variable does not work, set the wkhtmltopdf installation path here    |
+| `import_external_reference_import_pdf_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD` | Yes       | If import_as_md is true, try to convert the PDF to Markdown                              |
+
 After adding the connector, you should be able to extract information from a report.
 
 *Reference: https://docs.oasis-open.org/cti/stix/v2.1/cs01/stix-v2.1-cs01.html*

diff --git a/internal-enrichment/import-external-reference/docker-compose.yml b/internal-enrichment/import-external-reference/docker-compose.yml
@@ -14,5 +14,4 @@ services:
       - IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_PDF=true # Import as PDF file
       - IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_MD=true # Import as MarkDown file
       - IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD=true # If import_as_md is true, try to convert PDF as Markdown
-      - WKHTMLTOPDF_PATH=ChangeMe
     restart: always
diff --git a/internal-enrichment/import-external-reference/src/config.yml.sample b/internal-enrichment/import-external-reference/src/config.yml.sample
@@ -14,5 +14,4 @@ connector:
 import_external_reference:
   import_as_pdf: true # Import as PDF file
   import_as_md: true # Import as MarkDown file
-  import_pdf_as_md: true # If import_as_md is true, try to convert PDF as Markdown
-  wkhtmltopdf_path: ChangeMe # Optional - Example 'C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe'
+  import_pdf_as_md: true # If import_as_md is true, try to convert the PDF to Markdown
diff --git a/internal-enrichment/import-external-reference/src/import-external-reference.py b/internal-enrichment/import-external-reference/src/import-external-reference.py
@@ -4,12 +4,12 @@
 from typing import Dict
 
 import html2text
-import pdfkit
 import yaml
 from pdfminer.converter import HTMLConverter
 from pdfminer.layout import LAParams
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
+from playwright.sync_api import sync_playwright
 from pycti import OpenCTIConnectorHelper, get_config_variable
 
 
@@ -80,41 +80,42 @@ def _process_external_reference(self, external_reference):
                         mime_type="application/pdf",
                     )
                 else:
-                    try:
-                        file_name = url_to_import.split("/")[-1] + ".pdf"
-                        options = {
-                            "javascript-delay": 10000,
-                            "load-error-handling": "skip",
-                            "custom-header": [
-                                (
-                                    "User-Agent",
-                                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64)",
-                                ),
-                            ],
-                        }
-                        if self.wkhtmltopdf_path:
-                            config = pdfkit.configuration(
-                                wkhtmltopdf=self.wkhtmltopdf_path
+                    file_name = url_to_import.split("/")[-1] + ".pdf"
+                    with sync_playwright() as p:
+                        browser = p.chromium.launch()
+                        page = browser.new_page()
+                        page.goto(url_to_import)
+                        page.wait_for_timeout(2000)
+                        found = False
+                        try:
+                            page.locator('a:has-text("accept all")').click(
+                                timeout=1000, force=True
                             )
-                            data = pdfkit.from_url(
-                                url_to_import,
-                                False,
-                                options=options,
-                                configuration=config,
-                            )
-                        else:
-                            data = pdfkit.from_url(
-                                url_to_import, False, options=options
-                            )
-                        self.helper.api.external_reference.add_file(
-                            id=external_reference["id"],
-                            file_name=file_name,
-                            data=data,
-                            mime_type="application/pdf",
-                        )
-                    except OSError as e:
-                        if "Done" not in str(e):
-                            raise e
+                            found = True
+                        except:
+                            pass
+                        if not found:
+                            try:
+                                page.locator('button:has-text("accept all")').click(
+                                    timeout=1000, force=True
+                                )
+                                found = True
+                            except:
+                                pass
+                        if found:
+                            page.wait_for_timeout(2000)
+                        page.pdf(format="A4", path="data.pdf")
+                        browser.close()
+                    with open("./data.pdf", "rb") as file:
+                        data = file.read()
+                    self.helper.api.external_reference.add_file(
+                        id=external_reference["id"],
+                        file_name=file_name,
+                        data=data,
+                        mime_type="application/pdf",
+                    )
+                    self.delete_files()
+
             except Exception as e:
                 raise ValueError(e)
         if self.import_as_md:
@@ -178,11 +179,31 @@ def _process_external_reference(self, external_reference):
                     text_maker.inline_links = True
                     text_maker.protect_links = True
                     text_maker.mark_code = True
-                    req = urllib.request.Request(url_to_import, headers=self.headers)
-                    response = urllib.request.urlopen(
-                        req, context=ssl.create_default_context()
-                    )
-                    html = response.read().decode("utf-8")
+                    with sync_playwright() as p:
+                        browser = p.chromium.launch()
+                        page = browser.new_page()
+                        page.goto(url_to_import)
+                        page.wait_for_timeout(2000)
+                        found = False
+                        try:
+                            page.locator('a:has-text("accept all")').click(
+                                timeout=1000, force=True
+                            )
+                            found = True
+                        except:
+                            pass
+                        if not found:
+                            try:
+                                page.locator('button:has-text("accept all")').click(
+                                    timeout=1000, force=True
+                                )
+                                found = True
+                            except:
+                                pass
+                        if found:
+                            page.wait_for_timeout(2000)
+                        html = page.content()
+                        browser.close()
                     data = text_maker.handle(html)
                     data = data.replace("](//", "](https://")
                     self.helper.api.external_reference.add_file(

diff --git a/internal-enrichment/import-external-reference/src/requirements.txt b/internal-enrichment/import-external-reference/src/requirements.txt
@@ -2,4 +2,4 @@ pycti==6.2.18
 weasyprint==62.3
 html2text==2024.2.26
 pdfminer.six==20240706
-pdfkit==1.0.0
+playwright==1.46.0