Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove scipy #1835

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions compose/local/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# set up makecert root CA
RUN curl http://localhost/rootCA.pem > /usr/local/share/ca-certificates/rootCA.crt && update-ca-certificates

# heroku cli
RUN curl https://cli-assets.heroku.com/install.sh | sh

# install NVM
ENV NVM_DIR /usr/local/nvm
ENV NODE_VERSION 18.16.0

RUN mkdir $NVM_DIR
RUN curl https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | bash \
&& . $NVM_DIR/nvm.sh \
&& nvm install $NODE_VERSION \
&& nvm alias default $NODE_VERSION \
&& nvm use default
&& . $NVM_DIR/nvm.sh \
&& nvm install $NODE_VERSION \
&& nvm alias default $NODE_VERSION \
&& nvm use default

# nvm installs each Node release under $NVM_DIR/versions/node/v<version>, so the
# global module directory and the binary directory must share that prefix.
ENV NODE_PATH $NVM_DIR/versions/node/v$NODE_VERSION/lib/node_modules
ENV PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin:$PATH

# heroku cli
RUN curl https://cli-assets.heroku.com/install.sh | sh

# Requirements are installed here to ensure they will be cached.
COPY ./pip /pip
#RUN --mount=type=cache,target=/root/.cache/pip pip install -r /pip/requirements.txt
Expand Down
Binary file removed muckrock/foia/classifier.pkl
Binary file not shown.
47 changes: 0 additions & 47 deletions muckrock/foia/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@

# Third Party
import boto3
import dill as pickle
import lob
import numpy as np
import requests
from anymail.exceptions import AnymailError
from constance import config
Expand All @@ -46,7 +44,6 @@
from phaxio.exceptions import PhaxioError
from raven import Client
from raven.contrib.celery import register_logger_signal, register_signal
from scipy.sparse import hstack
from zipstream import ZIP_DEFLATED, ZipFile

# MuckRock
Expand Down Expand Up @@ -389,32 +386,6 @@ def get_text_ocr(doc_id):

return document.full_text

def get_classifier():
    """Unpickle and return the contents of the bundled classifier file.

    The object is read from ``muckrock/foia/classifier.pkl`` (a path relative
    to the process working directory).  Callers in this module unpack the
    result as ``(vectorizer, selector, classifier)``.
    """
    classifier_path = "muckrock/foia/classifier.pkl"
    # NOTE(review): unpickling runs arbitrary code from the file; presumably
    # acceptable because the file lives inside the project tree - confirm.
    with open(classifier_path, "rb") as handle:
        bundle = pickle.load(handle)
    return bundle

def predict_status(vectorizer, selector, classifier, text, pages):
    """Predict the most probable status for a single piece of text.

    The text is vectorized, the page count is appended as one extra feature
    column, the selector transforms the combined features, and the class with
    the highest predicted probability is reported.

    Args:
        vectorizer: object whose ``transform`` takes a list of documents and
            returns a single-row matrix that ``scipy.sparse.hstack`` can stack.
        selector: object whose ``transform`` takes the combined feature matrix.
        classifier: object exposing ``predict_proba`` and ``classes_``.
        text: the text to classify.
        pages: page count, appended as a numeric feature.

    Returns:
        A ``(status, probability)`` tuple: the entry of ``classifier.classes_``
        with the highest probability (the first one on ties) and that
        probability.
    """
    text_features = vectorizer.transform([text])
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the same
    # dtype.  The page count is built as an explicit 1x1 column so it stacks
    # next to the single text row without relying on 1-D promotion.
    page_features = np.array([[pages]], dtype=float)
    features = selector.transform(hstack([text_features, page_features]))
    probs = classifier.predict_proba(features)[0]
    # argmax returns the first maximum, matching max() + list.index().
    best = int(np.argmax(probs))
    return classifier.classes_[best], probs[best]

def resolve_if_possible(resp_task):
    """Auto-resolve a response task when the ML prediction is trusted.

    Nothing happens unless ``config.ENABLE_ML`` is on and the task's
    ``status_probability`` is at least ``config.CONFIDENCE_MIN``.  When both
    hold, the task's predicted status is applied and the task is resolved on
    behalf of the ``mlrobot`` user; if that account does not exist, an error
    is logged instead.
    """
    confident = (
        config.ENABLE_ML and resp_task.status_probability >= config.CONFIDENCE_MIN
    )
    if not confident:
        return

    try:
        robot = User.objects.get(username="mlrobot")
        resp_task.set_status(resp_task.predicted_status)
        resolution = {"status": resp_task.predicted_status}
        resp_task.resolve(robot, resolution)
    except User.DoesNotExist:
        logger.error("mlrobot account does not exist")

def resolve_gloo_if_possible(resp_task, extracted_data):
"""Resolve this response task if possible based off of ML setttings"""

Expand Down Expand Up @@ -471,31 +442,13 @@ def resolve_gloo_if_possible(resp_task, extracted_data):
# wait longer for document cloud
classify_status.retry(countdown=60 * 30, args=[task_pk], kwargs=kwargs)

# old classify
full_text = resp_task.communication.communication + (" ".join(file_text))
vectorizer, selector, classifier = get_classifier()

status, prob = predict_status(
vectorizer, selector, classifier, full_text, total_pages
)

if not (config.ENABLE_GLOO and config.USE_GLOO):
resp_task.predicted_status = status
resp_task.status_probability = int(100 * prob)

resolve_if_possible(resp_task)

resp_task.save()

# new classify
if config.ENABLE_GLOO:
try:
extracted_data, status = asyncio.run(
process_request(
resp_task.communication.communication,
"\n\n".join(file_text),
mlrobot_status=status,
mlrobot_prob=str(int(100 * prob)),
task_url=settings.MUCKROCK_URL + resp_task.get_absolute_url(),
request_url=settings.MUCKROCK_URL
+ resp_task.communication.foia.get_absolute_url(),
Expand Down
22 changes: 19 additions & 3 deletions muckrock/foia/tests/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,41 @@

# Third Party
import nose.tools
from constance.test import override_config
from mock import Mock, patch

# MuckRock
from muckrock.core.factories import UserFactory
from muckrock.foia.factories import FOIACommunicationFactory
from muckrock.foia.tasks import classify_status
from muckrock.task.factories import ResponseTaskFactory


# Both Gloo settings are overridden to True for every test in this class.
@override_config(ENABLE_GLOO=True, USE_GLOO=True)
class TestFOIAClassify(TestCase):
"""Test the classification of a new communication"""

# NOTE(review): asyncio.run is patched twice (a bare Mock here and a configured
# Mock below) - presumably only one decorator is intended; confirm which stays.
@patch("asyncio.run", Mock())
# Replace asyncio.run with a Mock returning an (extracted data, status) pair;
# the extracted-data stand-in has no tracking number, price, or date estimate.
@patch(
"asyncio.run",
Mock(
return_value=(
Mock(
trackingNumber=None,
price=None,
dateEstimate=None,
),
"processed",
)
),
)
def test_classifier(self):
"""Classifier should populate the fields on the response task"""
# presumably the "gloo" user is looked up when the task is resolved -
# verify against resolve_gloo_if_possible
UserFactory(username="gloo")
comm = FOIACommunicationFactory(
communication="Here are your responsive documents"
)
task = ResponseTaskFactory(communication=comm)
classify_status.apply(args=(task.pk,), throw=True)
# Reload the row so the assertions see what the task saved.
task.refresh_from_db()
# NOTE(review): the next two truthiness checks overlap with the stricter
# checks after them; status_probability may no longer be populated - confirm.
nose.tools.ok_(task.predicted_status)
nose.tools.ok_(task.status_probability)
nose.tools.eq_(task.predicted_status, "processed")
nose.tools.ok_(task.resolved)
4 changes: 0 additions & 4 deletions pip/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ bleach # Used to sanitize any HTML we're rendering
boto3 # Used to access AWS
celery # Used to run async tasks
chardet # Detect character encodings for user uploaded text files
dill # Used to serialize machine learning data
django-activity-stream # Used for notifications
django-anymail[mailgun] # Use for sending email on production
django-autocomplete-light==3.9.0rc5 # Autocomplete drop down inputs
Expand Down Expand Up @@ -52,7 +51,6 @@ lob # sending mail via lob.com
markdown # Used for rendering Markdown, obviously!
memoize # cachable properties
newrelic # Interface to newrelic service
numpy # Used by machine learning
pandas # Used by gloo
pdfrw # Used for reading/writing PDFs for form filling in
phonenumberslite # Library for validating and formatting phone numbers
Expand All @@ -75,8 +73,6 @@ redis # Redis integration - for use with celery
reportlab # Used for adding text to PDFs for form filling in
requests # HTTP for humans
rules # Rule based permissions
scikit-learn # Used for machine learning
scipy # Used for machine learning
scout-apm # performance monitoring
simplejson # json decoder for requests
smart-open # Use for streaming files from S3
Expand Down
8 changes: 1 addition & 7 deletions pip/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ cssselect==0.9.2 # via premailer
cssutils==2.7.1 # via premailer
decorator==4.3.0 # via ipython, traitlets
defusedxml==0.7.0rc1 # via python3-openid, social-auth-core
dill==0.3.2 # via -r pip/requirements.in
django==4.2 # via -r pip/requirements.in, django-activity-stream, django-anymail, django-appconf, django-celery-email, django-choices, django-cors-headers, django-debug-toolbar, django-extensions, django-filter, django-hijack, django-localflavor, django-news-sitemaps, django-opensearch, django-phonenumber-field, django-picklefield, django-redis, django-reversion, django-sslify, django-storages, django-taggit, djangorestframework, dogslow, drf-nested-routers, easy-thumbnails, jsonfield
django-activity-stream==1.4.2 # via -r pip/requirements.in
django-anymail[mailgun]==9.1 # via -r pip/requirements.in
Expand Down Expand Up @@ -90,7 +89,6 @@ ipython==7.16.1 # via -r pip/requirements.in
ipython-genutils==0.2.0 # via traitlets
jedi==0.17.1 # via ipython
jmespath==0.10.0 # via boto3, botocore
joblib==0.16.0 # via scikit-learn
jsonfield==2.0.2 # via -r pip/requirements.in
kombu==4.6.11 # via celery
listcrunch==1.0.1 # via python-documentcloud
Expand All @@ -100,7 +98,7 @@ markdown==3.2.2 # via -r pip/requirements.in, pymdown-extensions
memoize==1.0.0 # via -r pip/requirements.in
multidict==6.0.4 # via aiohttp, yarl
newrelic==2.70.0.51 # via -r pip/requirements.in
numpy==1.19.0 # via -r pip/requirements.in, pandas, scikit-learn, scipy
numpy==1.24.4 # via pandas
oauth2client==4.1.2 # via google-api-python-client
oauthlib==2.1.0 # via requests-oauthlib, social-auth-core
openai==0.28.0 # via gloo-lib
Expand Down Expand Up @@ -159,8 +157,6 @@ rjsmin==1.2.1 # via django-compressor
rsa==3.4.2 # via oauth2client, python-jose
rules==2.2 # via -r pip/requirements.in
s3transfer==0.6.1 # via boto3
scikit-learn==0.23.1 # via -r pip/requirements.in
scipy==1.5.0 # via -r pip/requirements.in, scikit-learn
scout-apm==2.16.2 # via -r pip/requirements.in
scrapelib==2.2.0 # via govqa
simplejson==3.16.0 # via -r pip/requirements.in
Expand All @@ -172,15 +168,13 @@ social-auth-core[openidconnect]==4.1.0 # via -r pip/requirements.in, social-aut
sorl-thumbnail==12.9.0 # via -r pip/requirements.in
sqlparse==0.4.4 # via django, django-debug-toolbar
stripe==1.75.0 # via -r pip/requirements.in
threadpoolctl==2.1.0 # via scikit-learn
tiktoken==0.5.1 # via -r pip/requirements.in
tomli==2.0.1 # via pytest
tqdm==4.66.1 # via openai
traitlets==4.3.2 # via ipython
types-requests==2.31.0.5 # via gloo-lib
types-urllib3==1.26.25.14 # via types-requests
typing-extensions==4.8.0 # via annotated-types, pydantic, pydantic-core, pypdf
tzdata==2023.3 # via pandas
unidecode==0.4.19 # via -r pip/requirements.in
uritemplate==3.0.0 # via google-api-python-client
urllib3[secure]==1.26.16 # via botocore, python-documentcloud, requests, scout-apm, scrapelib
Expand Down