Merge pull request #7 from kylase/feature/api
REST API for ParsCit
animeshprasad authored Sep 17, 2018
2 parents f4ad0ba + 51e8957 commit 0996021
Showing 21 changed files with 369 additions and 190 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
@@ -1,6 +1,7 @@
FROM python:2

ENV ENVIRONMENT prod
ENV NUM_WORKERS 1

WORKDIR /usr/src

@@ -12,3 +13,5 @@ RUN pip install --no-cache-dir Theano==1.0.2 numpy==1.14.5 gensim==3.5.0

RUN echo "[global]\nfloatX = float32" >> ~/.theanorc
RUN echo "[blas]\nldflags = -lblas -lgfortran" >> ~/.theanorc

EXPOSE 8000
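
For reference, the two `RUN echo` lines above rely on `/bin/sh` in the `python:2` base image (dash, whose `echo` expands `\n` escapes), so the generated `~/.theanorc` should contain the following — a sketch of the expected result, not a file from the repository:

```
[global]
floatX = float32
[blas]
ldflags = -lblas -lgfortran
```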
137 changes: 0 additions & 137 deletions Predictor.py

This file was deleted.

28 changes: 25 additions & 3 deletions README.md
@@ -9,22 +9,30 @@ Neural ParsCit is a citation string parser which parses reference strings into i

To use the tagger, you need Python 2.7 with NumPy, Theano, and Gensim installed.

You can use environment variables to set the following (see the example after this list):
- `MODEL_PATH`: Path to the model's parameters
- `WORD_EMB_PATH`: Path to the word embeddings
- `TIMEOUT`: Timeout in seconds for gunicorn when starting the Flask app. Increase this if the Flask app fails to start because building the model takes too long. [Default: 60]
- `NUM_WORKERS`: Number of workers that gunicorn spawns. [Default: 1]
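
For example, the variables can be set when starting the container — a sketch: the values are illustrative, and the image name follows the Docker instructions below:

```
docker run -p 8000:8000 \
    -e MODEL_PATH=models/neuralParscit/ \
    -e WORD_EMB_PATH=vectors_with_unk.kv \
    -e TIMEOUT=120 \
    -e NUM_WORKERS=2 \
    theano-gensim:latest
```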

### Using virtualenv in Linux systems

```
virtualenv -ppython2.7 .venv
source .venv/bin/activate
pip install -r requirements.txt
pip install -r requirements/<env>.txt
```

Where `<env>` is one of `prod`, `dev`, or `test`, e.g. `pip install -r requirements/prod.txt`.

### Using Docker

1. Build the image: `docker build -t theano-gensim - < Dockerfile`
1. Run the repo mounted to the container: `docker run -it -v /path/to/Neural-ParsCit:/usr/src --name np theano-gensim:latest /bin/bash`
1. Run the repo mounted to the container: `docker run -it -v $(pwd):/usr/src --name np theano-gensim:latest /bin/bash`

## Word Embeddings

The word embeddings do not come with this repository. You can obtain the [word embeddings without `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) (not recommended for v1.0.3) or [word embeddings with `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors_with_unk.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) (deprecated in v1.0.3 as the entire word vectors can be loaded with less memory) from WING website. Please read the next section on availability of `<UNK>` in word embeddings.
The word embeddings do not come with this repository. You can obtain the [word embeddings with `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors_with_unk.tar.gz) from the WING website. Please read the next section on the availability of `<UNK>` in word embeddings.

You will need to extract the content of the word embedding archive (`vectors_with_unk.tar.gz`) to the root directory for this repository by running `tar xfz vectors_with_unk.tar.gz`.

@@ -34,8 +42,11 @@ If the word embeddings provided do not have `<UNK>`, your instance will not bene

Without `<UNK>`, at most 7.5 GB of memory is required, as the entire set of word vectors must be instantiated in memory to create the new matrix. Embeddings with `<UNK>` require much less memory, at most 4.5 GB.


## Parse citation strings

### Command Line

The fastest way to use the parser is to run the state-of-the-art pre-trained model as follows:

@@ -46,6 +57,17 @@

The script can run interactively or input can be passed in a file.

The state-of-the-art trained model is provided in the `models` folder and is named `neuralParsCit`. The binary file for word embeddings is provided in the Docker image of the current version of Neural ParsCit. The hyperparameter `discarded` is the number of embeddings not used in our model. Retained words have a frequency of more than 0 in the ACM citation literature from 1994-2014.

### Using a Web Server

The web server (a Flask app) provides a REST API.

To run the web server, start a container with port 8000 published:
`docker run --rm -it -p 8000:8000 -e TIMEOUT=60 -v $(pwd):/usr/src --name np theano-gensim:latest /bin/bash`

Inside the container, start the app with `gunicorn -b 0.0.0.0:8000 -w $NUM_WORKERS --timeout $TIMEOUT run_app:app`.

The REST API documentation can be found at `http://localhost:8000/docs`.
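
Once the server is up, the endpoint can be exercised with `curl` — a sketch: the reference string is illustrative, and the JSON key `string` follows the request parser in `app/resources/parscit.py`:

```
curl -X POST http://localhost:8000/parscit/parse \
     -H "Content-Type: application/json" \
     -d '{"string": "Councill, I., Giles, C. L., Kan, M.-Y. (2008) ParsCit: An open-source CRF reference string parsing package. LREC 2008."}'
```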


## Train a model

Expand Down
66 changes: 66 additions & 0 deletions app/__init__.py
@@ -0,0 +1,66 @@
import os
import logging
from flask import Flask, Blueprint, jsonify
from flask_restful_swagger_2 import Api, get_swagger_blueprint
from flask_swagger_ui import get_swaggerui_blueprint
from app.resources.parscit import Parse, ParseBatch
from utils import get_model


def create_app(config):
"""
Wrapper function for Flask app
params:
config: Config
"""
app = Flask(__name__)
app.config.from_object(config)

model_path = os.path.abspath(os.getenv('MODEL_PATH',
default='models/neuralParscit/'))
word_emb_path = os.path.abspath(os.getenv('WORD_EMB_PATH',
default='vectors_with_unk.kv'))

with app.app_context():
logging.info("Loading model from {} and using word embeddings from {}".format(model_path, word_emb_path))
model, inference = get_model(model_path, word_emb_path)
setattr(app, 'model', model)
setattr(app, 'inference', inference)
# Invert the model's id -> word and id -> char maps so request tokens can be encoded.
setattr(app, 'word_to_id', {v: i for i, v in model.id_to_word.items()})
setattr(app, 'char_to_id', {v: i for i, v in model.id_to_char.items()})

API_DOC_PATH = '/docs'
SWAGGER_PATH = '/swagger'

api_bp = Blueprint('api', __name__)
api = Api(api_bp, add_api_spec_resource=False)
api.add_resource(Parse, '/parscit/parse')
api.add_resource(ParseBatch, '/parscit/parse/batch')

docs = [api.get_swagger_doc()]

swagger_ui_blueprint = get_swaggerui_blueprint(
API_DOC_PATH,
SWAGGER_PATH + '.json',
config={
'app_name': 'ParsCit API'
}
)

app.register_blueprint(api.blueprint)
app.register_blueprint(get_swagger_blueprint(docs, SWAGGER_PATH,
title='ParsCit API',
api_version='1.0',
base_path='/'))
app.register_blueprint(swagger_ui_blueprint, url_prefix=API_DOC_PATH)

@app.errorhandler(404)
def not_found(error):
"""
Handles URLs that are not specified
"""
return jsonify({
'message': "API doesn't exist"
}), 404

return app
Empty file added app/resources/__init__.py
Empty file.
99 changes: 99 additions & 0 deletions app/resources/parscit.py
@@ -0,0 +1,99 @@
from __future__ import print_function
import numpy as np
from flask import abort, current_app
from flask_restful import reqparse
from flask_restful_swagger_2 import swagger, Resource
from app.resources.schemas import Entity, ParseResponse, ParseBatchResponse
from utils import create_input
from loader import prepare_dataset

class Parse(Resource):
"""
Resource that parses a single reference string into labelled entities.
"""
parser = reqparse.RequestParser()
parser.add_argument('string', type=unicode, trim=True, required=True, location='json')
@swagger.doc({
'description': 'Parse a single string and return the associated entity for each token in the string.',
'reqparser': {
'name': 'Single Submission Request',
'parser': parser
},
'responses': {
'200': {
'description': 'Successfully parsed provided string.',
'schema': ParseResponse
}
}
})

def post(self):
"""
Parse a single string and return the associated entity for each token in the string.
"""
args = self.parser.parse_args()
ref_string = args.get('string')
if ref_string is None or ref_string == "":
# Hackish way as reqparse can't catch empty string
abort(400, description='string is empty or not provided.')

tokens = ref_string.split(" ")

data = prepare_dataset([[[token] for token in tokens]],
current_app.word_to_id,
current_app.char_to_id,
current_app.model.parameters['lower'],
True)

model_inputs = create_input(data[0], current_app.model.parameters, False)
# Drop the first and last predictions (begin/end sequence markers).
y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
tags = [current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]

response = ParseResponse(reference_string=ref_string,
data=[Entity(term=term, entity=entity)
for term, entity in zip(tokens, tags)])
return response

class ParseBatch(Resource):
parser = reqparse.RequestParser()
parser.add_argument('strings', type=unicode, action='append', required=True, location='json')
@swagger.doc({
'description': 'Parse multiple strings and return the associated entity for each token in each string.',
'reqparser': {
'name': 'Multiple Submission Request',
'parser': parser
},
'responses': {
'200': {
'description': 'Successfully parsed provided strings.',
'schema': ParseBatchResponse
}
}
})
def post(self):
"""
Parse multiple strings and return the associated entity for each token in each string.
"""
args = self.parser.parse_args()
ref_strings = args.get('strings')

tokens = [[[token] for token in ref_string.split(" ")] for ref_string in ref_strings]
data = prepare_dataset(tokens,
current_app.word_to_id,
current_app.char_to_id,
current_app.model.parameters['lower'],
True)

tagged = []

for index, datum in enumerate(data):
model_inputs = create_input(datum, current_app.model.parameters, False)
y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
tags = [current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]

tagged.append([Entity(term=term, entity=entity)
for term, entity in zip(ref_strings[index].split(" "), tags)])

response = ParseBatchResponse(reference_strings=ref_strings,
data=tagged)
return response
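
A hypothetical batch request against this resource, assuming the server from the README is running on port 8000 (the JSON key `strings` follows the request parser above; the reference strings are illustrative):

```
curl -X POST http://localhost:8000/parscit/parse/batch \
     -H "Content-Type: application/json" \
     -d '{"strings": ["First example reference string.", "Second example reference string."]}'
```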
