-
Notifications
You must be signed in to change notification settings - Fork 321
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[omm] Hash api implementation first draft #1355
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
|
||
""" | ||
Accessors for various "global" resources, usually cached by request lifetime | ||
|
||
I can't tell if these should just be in app.py, so I'm sticking them here for now, | ||
since one advantage of putting these in functions is we can type the output. | ||
""" | ||
|
||
from flask import g | ||
|
||
from OpenMediaMatch.storage.interface import IUnifiedStore | ||
from OpenMediaMatch.storage.default import DefaultOMMStore | ||
|
||
|
||
def get_storage() -> IUnifiedStore:
    """
    Return the storage object, creating and caching it on first use.

    The instance is stashed on flask's `g` under the name "storage", so
    later calls that see the same `g` get the same object back instead of
    constructing a new one.
    """
    if "storage" in g:
        return g.storage
    # dougneal, you'll need to eventually add constructor arguments
    # for this to pass in the postgres/database object. We're just
    # hiding flask bits from pytx bits
    g.storage = DefaultOMMStore()
    return g.storage
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,21 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
|
||
""" | ||
Endpoints for hashing content | ||
""" | ||
|
||
from pathlib import Path | ||
import tempfile | ||
import typing as t | ||
|
||
from flask import Blueprint | ||
from flask import abort, request | ||
from flask import abort, request, current_app | ||
import requests | ||
|
||
from threatexchange.content_type.content_base import ContentType | ||
from threatexchange.signal_type.signal_base import FileHasher, SignalType | ||
|
||
from OpenMediaMatch import app_resources | ||
|
||
bp = Blueprint("hashing", __name__) | ||
|
||
|
@@ -10,18 +26,65 @@ def hash_media(): | |
Fetch content and return its hash. | ||
TODO: implement | ||
""" | ||
|
||
content_type = _parse_request_content_type() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. How about we get the content type from the response headers (below) and simplify the interface? There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Works for me - sometimes the content type cannot be determined from the URL, should we provide an optional param in that case? |
||
signal_types = _parse_request_signal_type(content_type) | ||
|
||
media_url = request.args.get("url", None) | ||
if media_url is None: | ||
# path is required, otherwise we don't know what we're hashing. | ||
# TODO: a more helpful message | ||
abort(400) | ||
|
||
hash_types = request.args.get("types", None) | ||
if hash_types is not None: | ||
# TODO: parse this into a list of hash types | ||
pass | ||
|
||
# TODO | ||
# - download the media | ||
# - decode it | ||
# - hash it | ||
abort(400, "url is required") | ||
|
||
download_resp = requests.get(media_url, allow_redirects=True, timeout=30 * 1000) | ||
download_resp.raise_for_status() | ||
|
||
ret = {} | ||
|
||
# For images, we may need to copy the file suffix (.png, jpeg, etc) for it to work | ||
with tempfile.NamedTemporaryFile("wb") as tmp: | ||
current_app.logger.debug("Writing to %s", tmp.name) | ||
tmp.write(download_resp.content) | ||
path = Path(tmp.name) | ||
for st in signal_types.values(): | ||
# At this point, every BytesHasher is a FileHasher, but we could | ||
# explicitly pull those out to avoid storing any copies of | ||
# data locally, even temporarily | ||
if issubclass(st, FileHasher): | ||
ret[st.get_name()] = st.hash_from_file(path) | ||
return ret | ||
|
||
|
||
def _parse_request_content_type() -> ContentType:
    """
    Resolve the `content_type` query argument to a configured content type.

    The argument (empty string when absent) is looked up in the content
    type configs held by storage. The request is aborted with a 400 when
    there is no config under that name, or when the matching config is not
    enabled. Otherwise the config's `content_type` is returned.
    """
    store = app_resources.get_storage()
    arg = request.args.get("content_type", "")
    config = store.get_content_type_configs().get(arg)

    # Unknown name: nothing registered under the requested key.
    if config is None:
        abort(400, f"no such content_type: '{arg}'")
    # Known, but switched off in the config.
    if not config.enabled:
        abort(400, f"content_type {arg} is disabled")

    return config.content_type
|
||
|
||
def _parse_request_signal_type(content_type: ContentType) -> t.Mapping[str, SignalType]:
    """
    Work out which signal types the request wants for `content_type`.

    Starts from the signal types storage reports as enabled for the content
    type (aborting with a 500 if there are none). When the `types` query
    argument is absent, all of them are returned. Otherwise `types` is read
    as a comma-separated list of names: blank entries are ignored, an
    unknown or disabled name aborts with a 400, and a list that ends up
    empty also aborts with a 400.

    NOTE(review): hash_media() passes the returned values to issubclass(),
    so they appear to be SignalType classes rather than instances - confirm
    whether the annotation should be t.Type[SignalType].
    """
    store = app_resources.get_storage()
    enabled = store.get_enabled_signal_types_for_content_type(content_type)
    if not enabled:
        abort(500, "No signal types configured!")

    requested = request.args.get("types", None)
    if requested is None:
        # No explicit selection: everything enabled is fair game.
        return enabled

    chosen = {}
    for st_name in map(str.strip, requested.split(",")):
        if not st_name:
            # Tolerate stray commas / whitespace-only entries.
            continue
        if st_name not in enabled:
            abort(400, f"signal type '{st_name}' doesn't exist or is disabled")
        chosen[st_name] = enabled[st_name]

    if not chosen:
        abort(400, "empty signal type selection")
    return chosen
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why was this needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried running the flask CLI just to get a feel for it, and my run had an error, and I spent ~10 minutes learning this env variable needed to be populated, which I then realized for development will always just be this same value.
I changed the CLI default from throwing an exception to loading this config.