diff --git a/internal-enrichment/import-external-reference/Dockerfile b/internal-enrichment/import-external-reference/Dockerfile index e59909389..b0a56b336 100644 --- a/internal-enrichment/import-external-reference/Dockerfile +++ b/internal-enrichment/import-external-reference/Dockerfile @@ -7,13 +7,10 @@ COPY src /opt/opencti-connector-import-external-reference # Install Python modules # hadolint ignore=DL3003 RUN apt-get update && \ - apt-get install -y git build-essential libmagic-dev libffi-dev libxml2-dev libxslt-dev libssl-dev cargo libjpeg-dev zlib1g-dev && \ - ARCH=`echo -n $(uname -m) | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/'` && \ - wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6.1-2/wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \ - apt-get install -y ./wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \ - rm wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \ + apt-get install -y git build-essential libmagic-dev libffi-dev libxml2-dev libxslt-dev libssl-dev cargo libjpeg-dev zlib1g-dev libxkbcommon0 libgbm1 && \ cd /opt/opencti-connector-import-external-reference && \ - pip3 install --no-cache-dir -r requirements.txt + pip3 install --no-cache-dir -r requirements.txt && \ + playwright install # Expose and entrypoint COPY entrypoint.sh / diff --git a/internal-enrichment/import-external-reference/README.md b/internal-enrichment/import-external-reference/README.md index e6731d948..c4ef2a429 100644 --- a/internal-enrichment/import-external-reference/README.md +++ b/internal-enrichment/import-external-reference/README.md @@ -27,8 +27,8 @@ OpenCTI data is coming from *import* connectors. | `connector_log_level` | `CONNECTOR_LOG_LEVEL` | Yes | Connector logging verbosity, could be `debug`, `info`, `warn` or `error` (less verbose). | | `import_external_reference_import_as_pdf` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_PDF` | Yes | Import as PDF file | | `import_external_reference_import_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_MD` | Yes | Import as MD file | -| `import_external_reference_import_pdf_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD` | Yes | If import_as_md is true, try to convert PDF as Markdown | -| `wkhtmltopdf_path` | `WKHTMLTOPDF_PATH` | No | If the environment variable does not work, set the wkhtmltopdf installation path here | +| `import_external_reference_import_pdf_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD` | Yes | If import_as_md is true, try to convert PDF as Markdown | + After adding the connector, you should be able to extract information from a report. *Reference: https://docs.oasis-open.org/cti/stix/v2.1/cs01/stix-v2.1-cs01.html* diff --git a/internal-enrichment/import-external-reference/docker-compose.yml b/internal-enrichment/import-external-reference/docker-compose.yml index d9e2ba4f8..9212cb7e8 100644 --- a/internal-enrichment/import-external-reference/docker-compose.yml +++ b/internal-enrichment/import-external-reference/docker-compose.yml @@ -14,5 +14,4 @@ services: - IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_PDF=true # Import as PDF file - IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_MD=true # Import as MarkDown file - IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD=true # If import_as_md is true, try to convert PDF as Markdown - - WKHTMLTOPDF_PATH=ChangeMe restart: always diff --git a/internal-enrichment/import-external-reference/src/config.yml.sample b/internal-enrichment/import-external-reference/src/config.yml.sample index 39735ace0..e0cb2331b 100644 --- a/internal-enrichment/import-external-reference/src/config.yml.sample +++ b/internal-enrichment/import-external-reference/src/config.yml.sample @@ -14,5 +14,4 @@ connector: import_external_reference: import_as_pdf: true # Import as PDF file import_as_md: true # Import as MarkDown file - import_pdf_as_md: true # If import_as_md is true, try to convert PDF as Markdown - wkhtmltopdf_path: ChangeMe # Optional - Example 'C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe' \ No newline at end of file + import_pdf_as_md: true # If import_as_md is true, try to convert PDF as Markdown \ No newline at end of file diff --git a/internal-enrichment/import-external-reference/src/import-external-reference.py b/internal-enrichment/import-external-reference/src/import-external-reference.py index 1c5fa080a..0ab4eeb8d 100644 --- a/internal-enrichment/import-external-reference/src/import-external-reference.py +++ b/internal-enrichment/import-external-reference/src/import-external-reference.py @@ -4,12 +4,12 @@ from typing import Dict import html2text -import pdfkit import yaml from pdfminer.converter import HTMLConverter from pdfminer.layout import LAParams from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage +from playwright.sync_api import sync_playwright from pycti import OpenCTIConnectorHelper, get_config_variable @@ -80,41 +80,42 @@ def _process_external_reference(self, external_reference): mime_type="application/pdf", ) else: - try: - file_name = url_to_import.split("/")[-1] + ".pdf" - options = { - "javascript-delay": 10000, - "load-error-handling": "skip", - "custom-header": [ - ( - "User-Agent", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64)", - ), - ], - } - if self.wkhtmltopdf_path: - config = pdfkit.configuration( - wkhtmltopdf=self.wkhtmltopdf_path + file_name = url_to_import.split("/")[-1] + ".pdf" + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + page.goto(url_to_import) + page.wait_for_timeout(2000) + found = False + try: + page.locator('a:has-text("accept all")').click( + timeout=1000, force=True ) - data = pdfkit.from_url( - url_to_import, - False, - options=options, - configuration=config, - ) - else: - data = pdfkit.from_url( - url_to_import, False, options=options - ) - self.helper.api.external_reference.add_file( - id=external_reference["id"], - file_name=file_name, - data=data, - mime_type="application/pdf", - ) - except OSError as e: - if "Done" not in str(e): - raise e + found = True + except: + pass + if not found: + try: + page.locator('button:has-text("accept all")').click( + timeout=1000, force=True + ) + found = True + except: + pass + if found: + page.wait_for_timeout(2000) + page.pdf(format="A4", path="data.pdf") + browser.close() + with open("./data.pdf", "rb") as file: + data = file.read() + self.helper.api.external_reference.add_file( + id=external_reference["id"], + file_name=file_name, + data=data, + mime_type="application/pdf", + ) + self.delete_files() + except Exception as e: raise ValueError(e) if self.import_as_md: @@ -178,11 +179,31 @@ def _process_external_reference(self, external_reference): text_maker.inline_links = True text_maker.protect_links = True text_maker.mark_code = True - req = urllib.request.Request(url_to_import, headers=self.headers) - response = urllib.request.urlopen( - req, context=ssl.create_default_context() - ) - html = response.read().decode("utf-8") + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + page.goto(url_to_import) + page.wait_for_timeout(2000) + found = False + try: + page.locator('a:has-text("accept all")').click( + timeout=1000, force=True + ) + found = True + except: + pass + if not found: + try: + page.locator('button:has-text("accept all")').click( + timeout=1000, force=True + ) + found = True + except: + pass + if found: + page.wait_for_timeout(2000) + html = page.content() + browser.close() data = text_maker.handle(html) data = data.replace("](//", "](https://") self.helper.api.external_reference.add_file( diff --git a/internal-enrichment/import-external-reference/src/requirements.txt b/internal-enrichment/import-external-reference/src/requirements.txt index d1067ffdd..5b13b80f9 100644 --- a/internal-enrichment/import-external-reference/src/requirements.txt +++ b/internal-enrichment/import-external-reference/src/requirements.txt @@ -2,4 +2,4 @@ pycti==6.2.18 weasyprint==62.3 html2text==2024.2.26 pdfminer.six==20240706 -pdfkit==1.0.0 +playwright==1.46.0 \ No newline at end of file