Skip to content

Commit

Permalink
[import-external-reference] Fully change the library to generate PDF …
Browse files Browse the repository at this point in the history
…/ MD, accept cookies (#1866)
  • Loading branch information
SamuelHassine committed Sep 8, 2024
1 parent a5650fa commit 7808beb
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 52 deletions.
9 changes: 3 additions & 6 deletions internal-enrichment/import-external-reference/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,10 @@ COPY src /opt/opencti-connector-import-external-reference
# Install Python modules
# hadolint ignore=DL3003
RUN apt-get update && \
apt-get install -y git build-essential libmagic-dev libffi-dev libxml2-dev libxslt-dev libssl-dev cargo libjpeg-dev zlib1g-dev && \
ARCH=`echo -n $(uname -m) | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/'` && \
wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6.1-2/wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \
apt-get install -y ./wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \
rm wkhtmltox_0.12.6.1-2.bullseye_${ARCH}.deb && \
apt-get install -y git build-essential libmagic-dev libffi-dev libxml2-dev libxslt-dev libssl-dev cargo libjpeg-dev zlib1g-dev libxkbcommon0 libgbm1 && \
cd /opt/opencti-connector-import-external-reference && \
pip3 install --no-cache-dir -r requirements.txt
pip3 install --no-cache-dir -r requirements.txt && \
playwright install

# Expose and entrypoint
COPY entrypoint.sh /
Expand Down
4 changes: 2 additions & 2 deletions internal-enrichment/import-external-reference/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ OpenCTI data is coming from *import* connectors.
| `connector_log_level` | `CONNECTOR_LOG_LEVEL` | Yes | Connector logging verbosity: one of `debug`, `info`, `warn`, or `error` (least verbose). |
| `import_external_reference_import_as_pdf` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_PDF` | Yes | Import as PDF file |
| `import_external_reference_import_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_MD` | Yes | Import as MD file |
| `import_external_reference_import_pdf_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD` | Yes | If `import_as_md` is true, try to convert PDF files to Markdown |
| `wkhtmltopdf_path` | `WKHTMLTOPDF_PATH` | No | If the environment variable does not work, set the wkhtmltopdf installation path here |
| `import_external_reference_import_pdf_as_md` | `IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD` | Yes | If `import_as_md` is true, try to convert PDF files to Markdown |

After adding the connector, you should be able to extract information from a report.

*Reference: https://docs.oasis-open.org/cti/stix/v2.1/cs01/stix-v2.1-cs01.html*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,4 @@ services:
- IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_PDF=true # Import as PDF file
- IMPORT_EXTERNAL_REFERENCE_IMPORT_AS_MD=true # Import as Markdown file
- IMPORT_EXTERNAL_REFERENCE_IMPORT_PDF_AS_MD=true # If import_as_md is true, try to convert PDF to Markdown
- WKHTMLTOPDF_PATH=ChangeMe
restart: always
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,4 @@ connector:
import_external_reference:
import_as_pdf: true # Import as PDF file
import_as_md: true # Import as Markdown file
import_pdf_as_md: true # If import_as_md is true, try to convert PDF to Markdown
wkhtmltopdf_path: ChangeMe # Optional - Example 'C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe'
import_pdf_as_md: true # If import_as_md is true, try to convert PDF to Markdown
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
from typing import Dict

import html2text
import pdfkit
import yaml
from pdfminer.converter import HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from playwright.sync_api import sync_playwright
from pycti import OpenCTIConnectorHelper, get_config_variable


Expand Down Expand Up @@ -80,41 +80,42 @@ def _process_external_reference(self, external_reference):
mime_type="application/pdf",
)
else:
try:
file_name = url_to_import.split("/")[-1] + ".pdf"
options = {
"javascript-delay": 10000,
"load-error-handling": "skip",
"custom-header": [
(
"User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64)",
),
],
}
if self.wkhtmltopdf_path:
config = pdfkit.configuration(
wkhtmltopdf=self.wkhtmltopdf_path
file_name = url_to_import.split("/")[-1] + ".pdf"
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto(url_to_import)
page.wait_for_timeout(2000)
found = False
try:
page.locator('a:has-text("accept all")').click(
timeout=1000, force=True
)
data = pdfkit.from_url(
url_to_import,
False,
options=options,
configuration=config,
)
else:
data = pdfkit.from_url(
url_to_import, False, options=options
)
self.helper.api.external_reference.add_file(
id=external_reference["id"],
file_name=file_name,
data=data,
mime_type="application/pdf",
)
except OSError as e:
if "Done" not in str(e):
raise e
found = True
except:
pass
if not found:
try:
page.locator('button:has-text("accept all")').click(
timeout=1000, force=True
)
found = True
except:
pass
if found:
page.wait_for_timeout(2000)
page.pdf(format="A4", path="data.pdf")
browser.close()
with open("./data.pdf", "rb") as file:
data = file.read()
self.helper.api.external_reference.add_file(
id=external_reference["id"],
file_name=file_name,
data=data,
mime_type="application/pdf",
)
self.delete_files()

except Exception as e:
raise ValueError(e)
if self.import_as_md:
Expand Down Expand Up @@ -178,11 +179,31 @@ def _process_external_reference(self, external_reference):
text_maker.inline_links = True
text_maker.protect_links = True
text_maker.mark_code = True
req = urllib.request.Request(url_to_import, headers=self.headers)
response = urllib.request.urlopen(
req, context=ssl.create_default_context()
)
html = response.read().decode("utf-8")
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
page.goto(url_to_import)
page.wait_for_timeout(2000)
found = False
try:
page.locator('a:has-text("accept all")').click(
timeout=1000, force=True
)
found = True
except:
pass
if not found:
try:
page.locator('button:has-text("accept all")').click(
timeout=1000, force=True
)
found = True
except:
pass
if found:
page.wait_for_timeout(2000)
html = page.content()
browser.close()
data = text_maker.handle(html)
data = data.replace("](//", "](https://")
self.helper.api.external_reference.add_file(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ pycti==6.2.18
weasyprint==62.3
html2text==2024.2.26
pdfminer.six==20240706
pdfkit==1.0.0
playwright==1.46.0

0 comments on commit 7808beb

Please sign in to comment.