From 5cb84e6f0ff60a507226485e56d601a39e0b644c Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sat, 15 Sep 2018 19:26:45 +0800
Subject: [PATCH 01/10] Fixed some code style

---
 loader.py |  2 +-
 run.py    |  4 ++--
 utils.py  | 21 +++++++++++----------
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/loader.py b/loader.py
index 5de7ef9e..be32a601 100644
--- a/loader.py
+++ b/loader.py
@@ -134,7 +134,7 @@ def prepare_dataset(sentences, word_to_id, char_to_id, lower=False, zeros=False)
     #Replace all digits with 0
     def f(x):
         x = x.lower() if lower else x
-        return re.sub('\d', '0', x) if zeros else x
+        return re.sub(r'\d', '0', x) if zeros else x
     data = []
     for s in sentences:
         str_words = [w[0] for w in s]
diff --git a/run.py b/run.py
index e583365d..fb3e09b7 100644
--- a/run.py
+++ b/run.py
@@ -43,8 +43,8 @@
 lower = model.parameters['lower']
 zeros = model.parameters['zeros']

-word_to_id = {v:i for i,v in model.id_to_word.items()}
-char_to_id = {v:i for i,v in model.id_to_char.items()}
+word_to_id = {v:i for i, v in model.id_to_word.items()}
+char_to_id = {v:i for i, v in model.id_to_char.items()}

 while True:
     if opts.run == 'file':
diff --git a/utils.py b/utils.py
index 32e299d7..4981bf1b 100644
--- a/utils.py
+++ b/utils.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import os
 import re
 import codecs
@@ -83,7 +84,7 @@ def zero_digits(s):
     """
     Replace every digit in a string by a zero.
     """
-    return re.sub('\d', '0', s)
+    return re.sub(r'\d', '0', s)


 def iob2(tags):
@@ -244,7 +245,7 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,

     # Write predictions to disk and run CoNLL script externally
     eval_id = np.random.randint(1000000, 2000000)
-    print "eval_id is : ", eval_id
+    print("eval_id is : ", eval_id)
     output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
     scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
     with codecs.open(output_path, 'w', 'utf8') as f:
@@ -254,28 +255,28 @@
     # CoNLL evaluation results
     eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')]
     for line in eval_lines:
-        print line
+        print(line)

     # Remove temp files
     # os.remove(output_path)
     # os.remove(scores_path)

     # Confusion matrix with accuracy for each tag
-    print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
+    print(("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
         "ID", "NE", "Total",
         *([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"])
-    )
-    for i in xrange(n_tags):
-        print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
+    ))
+    for i in range(n_tags):
+        print(("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
             str(i), id_to_tag[i], str(count[i].sum()),
             *([count[i][j] for j in xrange(n_tags)] + ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
-        )
+        ))

     # Global accuracy
-    print "%i/%i (%.5f%%)" % (
+    print("%i/%i (%.5f%%)" % (
         count.trace(), count.sum(), 100.
         * count.trace() / max(1, count.sum())
-    )
+    ))

     # F1 on all entities
     return float(eval_lines[1].strip().split()[-1])

From 922969da134eb8dd86e872a8db027c0795f11202 Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sat, 15 Sep 2018 19:28:49 +0800
Subject: [PATCH 02/10] Initial REST API (built with Flask)

---
 Dockerfile                        |  7 ++-
 app/__init__.py                   | 68 ++++++++++++++++++++++
 app/resources/__init__.py         |  0
 app/resources/parscit.py          | 97 +++++++++++++++++++++++++++++++
 app/resources/schemas/__init__.py | 48 +++++++++++++++
 app/settings.py                   | 29 +++++++++
 app/utils.py                      | 12 ++++
 requirements/dev.txt              |  2 +-
 requirements/prod.txt             |  5 ++
 requirements/test.txt             |  1 +
 run_app.py                        | 12 ++++
 11 files changed, 279 insertions(+), 2 deletions(-)
 create mode 100644 app/__init__.py
 create mode 100644 app/resources/__init__.py
 create mode 100644 app/resources/parscit.py
 create mode 100644 app/resources/schemas/__init__.py
 create mode 100644 app/settings.py
 create mode 100644 app/utils.py
 create mode 100644 run_app.py

diff --git a/Dockerfile b/Dockerfile
index bac1b4c2..0a8353a3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,9 @@
 FROM python:2

-ENV ENVIRONMENT prod
+ENV ENVIRONMENT prod
+ENV NUM_WORKERS 1
+ENV WORD_EMB_PATH
+ENV MODEL_PATH

 WORKDIR /usr/src
@@ -12,3 +15,5 @@ RUN pip install --no-cache-dir Theano==1.0.2 numpy==1.14.5 gensim==3.5.0

 RUN echo "[global]\nfloatX = float32" >> ~/.theanorc
 RUN echo "[blas]\nldflags = -lblas -lgfortran" >> ~/.theanorc
+
+EXPOSE 8000
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 00000000..7fb82d03
--- /dev/null
+++ b/app/__init__.py
@@ -0,0 +1,68 @@
+import os
+import logging
+from flask import Flask, Blueprint, jsonify, g
+from flask_restful_swagger_2 import Api, get_swagger_blueprint
+from flask_swagger_ui import get_swaggerui_blueprint
+from app.resources.parscit import Parse, ParseBatch
+from utils import get_model
+
+
+def create_app(config):
+    """
+    Wrapper function for Flask app
+    params:
+        config: Config
+    """
+    app = Flask(__name__)
+    app.config.from_object(config)
+
+    model_path = os.path.abspath(os.getenv('MODEL_PATH',
+                                           default='models/neuralParscit/'))
+    word_emb_path = os.path.abspath(os.getenv('WORD_EMB_PATH',
+                                              default='vectors_with_unk.kv'))
+
+    with app.app_context():
+        logging.info("Loading model from {} and using word embeddings from {}".format(model_path, word_emb_path))
+        model, inference = get_model(model_path, word_emb_path)
+        setattr(app, 'model', model)
+        setattr(app, 'inference', inference)
+        setattr(app, 'word_to_id', {v:i for i, v in model.id_to_word.items()})
+        setattr(app, 'char_to_id', {v:i for i, v in model.id_to_char.items()})
+
+    API_DOC_PATH = '/docs'
+    SWAGGER_PATH = '/swagger'
+
+    api_bp = Blueprint('api', __name__)
+    api = Api(api_bp, add_api_spec_resource=False)
+    api.add_resource(Parse, '/parscit/parse')
+    api.add_resource(ParseBatch, '/parscit/parse/batch')
+
+    docs = [api.get_swagger_doc()]
+
+    swagger_ui_blueprint = get_swaggerui_blueprint(
+        API_DOC_PATH,
+        SWAGGER_PATH + '.json',
+        config={
+            'app_name': 'ParsCit API'
+        }
+    )
+
+    app.register_blueprint(api.blueprint)
+    app.register_blueprint(get_swagger_blueprint(docs, SWAGGER_PATH,
+                                                 title='ParsCit API',
+                                                 api_version='1.0',
+                                                 base_path='/'))
+    app.register_blueprint(swagger_ui_blueprint, url_prefix=API_DOC_PATH)
+
+    @app.errorhandler(404)
+    def not_found(error):
+        """
+        Handles URLs that are not specified
+        """
+        return jsonify({
+            'error': {
+                'message': error.message
+            }
+        }), 404
+
+    return app
diff --git a/app/resources/__init__.py b/app/resources/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/app/resources/parscit.py b/app/resources/parscit.py
new file mode 100644
index 00000000..a3ebf56b
--- /dev/null
+++ b/app/resources/parscit.py
@@ -0,0 +1,97 @@
+from __future__ import print_function
+import numpy as np
+from flask_restful import reqparse
+from flask_restful_swagger_2 import swagger, Resource
+from flask import current_app, g
+from app.resources.schemas import Entity, ParseResponse, ParseBatchResponse
+from app.utils import get_model, get_word_to_id, get_char_to_id
+from utils import create_input
+from loader import prepare_dataset
+
+import logging
+
+class Parse(Resource):
+    """
+    """
+    parser = reqparse.RequestParser()
+    parser.add_argument('string', type=unicode, trim=True, required=True, location='json')
+    @swagger.doc({
+        'description': 'Parse a single string and return the associated entity for each token in the string.',
+        'reqparser': {
+            'name': 'Single Submission Request',
+            'parser': parser
+        },
+        'responses': {
+            '200': {
+                'description': 'Successfully parsed provided string.',
+                'schema': ParseResponse
+            }
+        }
+    })
+
+    def post(self):
+        """
+        Parse a single string and return the associated entity for each token in the string.
+        """
+        args = self.parser.parse_args()
+        ref_string = args.get('string')
+        tokens = ref_string.split(" ")
+
+        data = prepare_dataset([[[token] for token in tokens]],
+                               current_app.word_to_id,
+                               current_app.char_to_id,
+                               current_app.model.parameters['lower'],
+                               True)
+
+        model_inputs = create_input(data[0], current_app.model.parameters, False)
+        y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
+        tags = [current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]
+
+        response = ParseResponse(reference_string=ref_string,
+                                 data=[Entity(term=term, entity=entity)
+                                       for term, entity in zip(tokens, tags)])
+        return response
+
+class ParseBatch(Resource):
+    parser = reqparse.RequestParser()
+    parser.add_argument('strings', type=unicode, action='append', location='json')
+    @swagger.doc({
+        'description': 'Parse multiple strings and return the associated entity for each token in each string.',
+        'reqparser': {
+            'name': 'Multiple Submission Request',
+            'parser': parser
+        },
+        'responses': {
+            '200': {
+                'description': 'Successfully parsed provided strings.',
+                'schema': ParseBatchResponse
+            }
+        }
+    })
+    def post(self):
+        """
+        Parse multiple strings and return the associated entity for each token in each string.
+        """
+        args = self.parser.parse_args()
+        ref_strings = args.get('strings')
+
+        tokens = [[[token] for token in ref_string.split(" ")] for ref_string in ref_strings]
+        data = prepare_dataset(tokens,
+                               current_app.word_to_id,
+                               current_app.char_to_id,
+                               current_app.model.parameters['lower'],
+                               True)
+
+        tagged = []
+
+        for index, datum in enumerate(data):
+            model_inputs = create_input(datum, current_app.model.parameters, False)
+            y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
+            tags = [current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]
+
+            tagged.append([Entity(term=term, entity=entity)
+                           for term, entity in zip(ref_strings[index].split(" "), tags)])
+
+        response = ParseBatchResponse(reference_strings=ref_strings,
+                                      data=tagged)
+        return response
diff --git a/app/resources/schemas/__init__.py b/app/resources/schemas/__init__.py
new file mode 100644
index 00000000..586aee2d
--- /dev/null
+++ b/app/resources/schemas/__init__.py
@@ -0,0 +1,48 @@
+from flask_restful_swagger_2 import Schema
+
+class Entity(Schema):
+    type = 'object'
+    properties = {
+        'term': {
+            'type': 'string'
+        },
+        'entity': {
+            'type': 'string',
+            'enum': [
+                'author',
+                'booktitle',
+                'date',
+                'editor',
+                'institution',
+                'journal',
+                'location',
+                'note',
+                'pages',
+                'publisher',
+                'tech',
+                'title',
+                'volume'
+            ]
+        }
+    }
+
+class ParseResponse(Schema):
+    type = 'object'
+    properties = {
+        'reference_string': {
+            'type': 'string'
+        },
+        'data': Entity.array()
+    }
+
+class ParseBatchResponse(Schema):
+    type = 'object'
+    properties = {
+        'reference_strings': {
+            'type': 'array'
+        },
+        'data': {
+            'type': 'array',
+            'items': Entity.array()
+        }
+    }
diff --git a/app/settings.py b/app/settings.py
new file mode 100644
index 00000000..21e345ad
--- /dev/null
+++ b/app/settings.py
@@ -0,0 +1,29 @@
+import os
+
+class Config(object):
+    SECRET_KEY = os.getenv('APP_SECRET')
+    BASE_DIR = os.path.abspath(os.path.dirname(__file__))
+    THREADS_PER_PAGE = 2
+    MODELS_DIR = os.path.join(BASE_DIR, 'models')
+
+class ProductionConfig(Config):
+    """
+    Production configuration
+    """
+    ENV = 'prod'
+    DEBUG = False
+
+class DevelopmentConfig(Config):
+    """
+    Development configuration
+    """
+    ENV = 'dev'
+    DEBUG = True
+
+class TestConfig(Config):
+    """
+    Test configuration
+    """
+    ENV = 'test'
+    DEBUG = True
+    TESTING = True
diff --git a/app/utils.py b/app/utils.py
new file mode 100644
index 00000000..7c311397
--- /dev/null
+++ b/app/utils.py
@@ -0,0 +1,12 @@
+import os
+from flask import current_app, g
+from model import Model
+
+def get_model(model_path, embedding_path):
+    if 'model' not in g:
+        g.model = Model(model_path=model_path)
+        g.model.parameters['pre_emb'] = os.path.join(os.getcwd(), embedding_path)
+        g.inference = g.model.build(training=False, **g.model.parameters)
+        g.model.reload()
+
+    return g.model, g.inference
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 4bee8e96..ada9d44d 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -3,4 +3,4 @@
 ipython==5.7.0
 git+https://github.com/pytorch/text.git@master
 torch==0.4.1
-sklearn==0.19.2
+scikit-learn==0.19.2
diff --git a/requirements/prod.txt b/requirements/prod.txt
index 52e2cbcf..b9408aef 100644
--- a/requirements/prod.txt
+++ b/requirements/prod.txt
@@ -1,3 +1,8 @@
 gensim==3.5.0
 theano==1.0.2
 numpy==1.14.5
+Flask==1.0.2
+flask_restful==0.3.6
+flask-restful-swagger-2==0.35
+flask-swagger-ui==3.6.0
+gunicorn==19.9.0
diff --git a/requirements/test.txt b/requirements/test.txt
index 15f3a40c..4675f7b7 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,3 +1,4 @@
 -r prod.txt
 pylint==1.9.2
 pytest==3.5.1
+pytest-flask==0.10.0
diff --git a/run_app.py b/run_app.py
new file mode 100644
index 00000000..d46d4112
--- /dev/null
+++ b/run_app.py
@@ -0,0 +1,12 @@
+import os
+from flask.helpers import get_debug_flag
+from app import create_app
+from app.settings import ProductionConfig, DevelopmentConfig
+
+CONFIG = DevelopmentConfig if get_debug_flag() else ProductionConfig
+
+app = create_app(CONFIG)
+
+if __name__ == '__main__':
+    app.run(host=os.getenv('HOST', default='0.0.0.0'),
+            port=int(os.getenv('PORT', default='5000')))

From fe5fea3c9910798334bd46a2dbdbf251331f8ddf Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sat, 15 Sep 2018 19:38:08 +0800
Subject: [PATCH 03/10] Updated with updated information and parameters

---
 README.md | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7aec729f..42b1c89d 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,12 @@ Neural ParsCit is a citation string parser which parses reference strings into i

 To use the tagger, you need Python 2.7, with Numpy, Theano and Gensim installed.

+You can use environment variables to set the following:
+- `MODEL_PATH`: Path to the model's parameters
+- `WORD_EMB_PATH`: Path to the word embeddings
+- `TIMEOUT`: Timeout for gunicorn when starting the Flask app. Increase this if the app fails to start because building the model takes too long. [Default: 60]
+- `NUM_WORKERS`: Number of workers which gunicorn spawns. [Default: 1]
+
 ### Using virtualenv in Linux systems

 ```
 virtualenv -ppython2.7 .venv
 source .venv/bin/activate
 pip install -r requirements.txt
 ```

 ## Word Embeddings

-The word embeddings do not come with this repository. You can obtain the [word embeddings without `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) (not recommended for v1.0.3) or [word embeddings with `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors_with_unk.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) (deprecated in v1.0.3 as the entire word vectors can be loaded with less memory) from the WING website. Please read the next section on availability of `<UNK>` in word embeddings.
+The word embeddings do not come with this repository. You can obtain the [word embeddings with `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors_with_unk.tar.gz) from the WING website. Please read the next section on availability of `<UNK>` in word embeddings.

 You will need to extract the content of the word embedding archive (`vectors_with_unk.tar.gz`) to the root directory for this repository by running `tar xfz vectors_with_unk.tar.gz`.

 If the word embeddings provided do not have `<UNK>`, your instance will not benefit from the lower memory requirement.

 Without `<UNK>`, at most 7.5 GB of memory is required as the entire word vectors need to be instantiated in memory to create the new matrix. By comparison, embeddings with `<UNK>` require much less: at most 4.5 GB.

+### Web Server
+
+The web server (a Flask app) provides a REST API for parsing citation strings.
+
+In order to run the web server:
+
+`docker run --rm -it -p 8000:8000 -e ENVIRONMENT=dev -e TIMEOUT=60 -v $(pwd):/usr/src --name np theano-gensim:latest /bin/bash`
+
+In the Docker container, run `gunicorn -b 0.0.0.0:8000 -w $NUM_WORKERS --timeout $TIMEOUT run_app:app`
+
+The REST API documentation can be found at `http://localhost:8000/docs`
+
 ## Parse citation strings

 The fastest way to use the parser is to run the state-of-the-art pre-trained model as follows:

From 9b5a20d81ac9a431e85a2d98e19cd979e07005d2 Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sat, 15 Sep 2018 19:43:05 +0800
Subject: [PATCH 04/10] Delete old web app

---
 Predictor.py  | 137 --------------------------------------------------
 webService.py |  35 -------------
 2 files changed, 172 deletions(-)
 delete mode 100644 Predictor.py
 delete mode 100644 webService.py

diff --git a/Predictor.py b/Predictor.py
deleted file mode 100644
index cc2f39d9..00000000
--- a/Predictor.py
+++ /dev/null
@@ -1,137 +0,0 @@
-from utils import evaluate, create_input
-from model import Model
-from loader import augment_with_pretrained, load_sentences
-import numpy as np
-import itertools
-import gensim, re
-import json
-
-class Predictor:
-
-    def __init__(self):
-        model_path = "final_model"
-        self.model = Model(model_path=model_path)
-        self.f = self.model.build(training=False, **self.model.parameters) #needs a better name
-        self.model.reload()
-
-        self.model.parameters['pre_emb'] = 'vectors.bin'
-
-        pretrained = gensim.models.word2vec.Word2Vec.load_word2vec_format(self.model.parameters['pre_emb'], binary=True)
-        new_weights = self.model.components['word_layer'].embeddings.get_value()
-
-        n_words = len(self.model.id_to_word)
-        freq = json.load(open('/home/wenqiang/tagger-master/mydockerbuild2/mydockerbuild/freq', 'r'))
-        words = [item[0] for item in freq]
-
-        self.model.id_to_word = {}
-
-        for i in xrange((n_words/2), n_words):
-            word = words[i]
-            if word in pretrained:
-                self.model.id_to_word[i-640780] = word
-                new_weights[i-640780] = pretrained[word]
-                # c_found += 1
-            elif word.lower() in pretrained:
-                self.model.id_to_word[i-640780] = word.lower()
-                new_weights[i-640780] = pretrained[word.lower()]
-                # c_lower += 1
-            elif re.sub('\d', '0', word.lower()) in pretrained:
-                self.model.id_to_word[i-640780] = re.sub('\d', '0', word.lower())
-                new_weights[i-640780] = pretrained[
-                    re.sub('\d', '0', word.lower())
-                ]
-                # c_zeros += 1
-
-        self.model.id_to_word[0] = '<UNK>'
-        self.model.components['word_layer'].embeddings.set_value(new_weights)
-
-        del pretrained
-        del new_weights
-
-        self.lower = self.model.parameters['lower']
-        self.zeros = self.model.parameters['zeros']
-
-        #Create new mapping because model.id_to_word only is an Ordered dict of only training and testing data
-
-        self.word_to_id = {v:i for i,v in self.model.id_to_word.items()}
-        self.char_to_id = {v:i for i,v in self.model.id_to_char.items()}
-
-
-    def prepare_dataset(self, sentences):
-        """
-        Prepare the dataset. Return a list of lists of dictionaries containing:
-            - word indexes
-            - word char indexes
-            - tag indexes
-        """
-        def f(x):
-            #if zeros:
-            return re.sub('\d', '0', x)
-        #def f(x): return x.lower() if lower else x
-        data = []
-        for s in sentences:
-            str_words = [w[0] for w in s]
-            words = [self.word_to_id[f(w) if f(w) in self.word_to_id else '<UNK>']
-                     for w in str_words]
-            # Skip characters that are not in the training set
-            #for num, word in enumerate(words):
-            #    if word < 0:
-            #        words[num]=word_to_id['<UNK>']
-            chars = [[self.char_to_id[c] for c in w if c in self.char_to_id]
-                     for w in str_words]
-            caps = [self.cap_feature(w) for w in str_words]
-            data.append({
-                'str_words': str_words,
-                'words': words,
-                'chars': chars,
-                'caps': caps,
-            })
-        return data
-
-
-    def cap_feature(self, s):
-        """
-        Capitalization feature:
-        0 = low caps
-        1 = all caps
-        2 = first letter caps
-        3 = one capital (not first letter)
-        """
-        if s.lower() == s:
-            return 0
-        elif s.upper() == s:
-            return 1
-        elif s[0].upper() == s[0]:
-            return 2
-        else:
-            return 3
-
-    #synchronized block: TO DO
-    def parseString(self, string):
-        #TO DO
-        #To be consumed by web-service
-        test_file = "test_file"
-        file = open(test_file, 'w')
-        file.write('\n'.join(string.encode('utf-8').split()))
-        file.close()
-        test_sentences = load_sentences(test_file, self.lower, self.zeros)
-        data = self.prepare_dataset(test_sentences)
-        result = ''
-        for citation in data:
-            input = create_input(citation, self.model.parameters, False)
-            y_pred = np.array(self.f[1](*input))[1:-1]
-            tags = []
-            for i in xrange(len(y_pred)):
-                tags.append(self.model.id_to_tag[y_pred[i]])
-            for num, word in enumerate(string.encode('utf-8').split()):
-                #print word.decode('utf-8')+'\t'+tags[num]
-                result += word.decode('utf-8')+'\t'+tags[num]+'\n'
-        return result
-
-if __name__ == '__main__':
-    p = Predictor()
-
-    while True:
-        string = raw_input("Enter the citation string: ").decode('utf-8')
-        r = p.parseString(string)
-        print(str(r))
\ No newline at end of file
diff --git a/webService.py b/webService.py
deleted file mode 100644
index 370c6919..00000000
--- a/webService.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#This shows a demo of how to use Neural ParsCit as web service for
-#enterprise systems. It's encouraged to used as service since it uses
-#a lot of memory due to word embeddings
-from flask import Flask, request
-from flask_restful import Resource, Api
-from sqlalchemy import create_engine
-from json import dumps
-from flask.ext.jsonpify import jsonify
-from time import gmtime, strftime
-from Predictor import Predictor
-
-app = Flask(__name__)
-api = Api(app)
-_predictor = Predictor()
-
-class Welcome(Resource):
-    def get(self):
-        d = dict()
-        d['status'] = 'It\'s Working!'
-        result = {'result': [d]}
-        return jsonify(result)
-
-class Parscit(Resource):
-    def get(self):
-        d = dict()
-        d['input_string'] = request.args.get('text')
-        d['parsed_string'] = _predictor.parseString(d['input_string'])
-        result = {'result': [d]}
-        return jsonify(result)
-
-api.add_resource(Parscit, '/parscit')
-api.add_resource(Welcome, '/status')
-
-if __name__ == '__main__':
-    app.run(host='0.0.0.0',port='5002')

From 10b14adc081a2fca77a7ab40e93553350c793a2b Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sat, 15 Sep 2018 19:47:45 +0800
Subject: [PATCH 05/10] Moved web server section

---
 README.md | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 42b1c89d..1013ff85 100644
--- a/README.md
+++ b/README.md
@@ -40,20 +40,11 @@
 If the word embeddings provided do not have `<UNK>`, your instance will not benefit from the lower memory requirement.

 Without `<UNK>`, at most 7.5 GB of memory is required as the entire word vectors need to be instantiated in memory to create the new matrix. By comparison, embeddings with `<UNK>` require much less: at most 4.5 GB.

-### Web Server
-
-The web server (a Flask app) provides a REST API for parsing citation strings.
-
-In order to run the web server:
-
-`docker run --rm -it -p 8000:8000 -e ENVIRONMENT=dev -e TIMEOUT=60 -v $(pwd):/usr/src --name np theano-gensim:latest /bin/bash`
-
-In the Docker container, run `gunicorn -b 0.0.0.0:8000 -w $NUM_WORKERS --timeout $TIMEOUT run_app:app`
-
-The REST API documentation can be found at `http://localhost:8000/docs`

 ## Parse citation strings

+### Command Line
+
 The fastest way to use the parser is to run the state-of-the-art pre-trained model as follows:

 ```
 ./run.py --run shell
 ```

 The script can run interactively or input can be passed in a file. In the interactive mode, …

 The state-of-the-art trained model is provided in the models folder and is named neuralParsCit. The binary file for word embeddings is provided in the docker image of the current version of neural ParsCit. The hyper parameter ```discarded``` is the number of embeddings not used in our model. Retained words have a frequency of more than 0 in the ACM citation literature from 1994-2014.

+### Using a Web Server
+
+The web server (a Flask app) provides a REST API.
+
+To run the web server:
+`docker run --rm -it -p 8000:8000 -e TIMEOUT=60 -v $(pwd):/usr/src --name np theano-gensim:latest /bin/bash`
+
+In the container, run `gunicorn -b 0.0.0.0:8000 -w $NUM_WORKERS --timeout $TIMEOUT run_app:app`
+
+The REST API documentation can be found at `http://localhost:8000/docs`
+
 ## Train a model

From a9924a0e66840d3e50fc107d94d109bce926b125 Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sat, 15 Sep 2018 19:50:32 +0800
Subject: [PATCH 06/10] Update command to install dependencies

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1013ff85..e951d40f 100644
--- a/README.md
+++ b/README.md
@@ -20,9 +20,11 @@
 ```
 virtualenv -ppython2.7 .venv
 source .venv/bin/activate
-pip install -r requirements.txt
+pip install -r requirements/<env>.txt
 ```

+Where `<env>` is `{prod, dev, test}`
+
 ### Using Docker

 1. Build the image: `docker build -t theano-gensim - < Dockerfile`

From 66a5e6032ee002c67a4f433425df855ba0834f2f Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sun, 16 Sep 2018 14:02:24 +0800
Subject: [PATCH 07/10] Added tests for APIs

---
 app/resources/parscit.py |  4 ++--
 tests/conftest.py        |  8 ++++++++
 tests/test_api.py        | 35 +++++++++++++++++++++++++++++++++
 tests/test_swagger.py    | 10 ++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_api.py
 create mode 100644 tests/test_swagger.py

diff --git a/app/resources/parscit.py b/app/resources/parscit.py
index a3ebf56b..8370fab2 100644
--- a/app/resources/parscit.py
+++ b/app/resources/parscit.py
@@ -4,7 +4,7 @@
 from flask_restful_swagger_2 import swagger, Resource
 from flask import current_app, g
 from app.resources.schemas import Entity, ParseResponse, ParseBatchResponse
-from app.utils import get_model, get_word_to_id, get_char_to_id
+from app.utils import get_model
 from utils import create_input
 from loader import prepare_dataset

@@ -54,7 +54,7 @@ class ParseBatch(Resource):
     parser = reqparse.RequestParser()
-    parser.add_argument('strings', type=unicode, action='append', location='json')
+    parser.add_argument('strings', type=unicode, action='append', required=True, location='json')
     @swagger.doc({
         'description': 'Parse multiple strings and return the associated entity for each token in each string.',
         'reqparser': {
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..d8bf5e6c
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,8 @@
+import pytest
+from app import create_app
+from app.settings import TestConfig
+
+@pytest.fixture
+def app():
+    _app = create_app(TestConfig)
+    return _app
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 00000000..7a20c4c8
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,35 @@
+import os
+import pytest
+
+@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
+def test_parse(client):
+    data = {
+        'string': 'Animesh Prasad, Manpreet Kaur and Min-Yen Kan (2018) Neural ParsCit: a deep learning-based reference string parser. International Journal on Digital Libraries. May 2018.'
+    }
+
+    assert client.post('/parscit/parse', json=data).status_code == 200
+
+# @pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
+# def test_parse_no_content(client):
+#     data = {
+#         'string': ""
+#     }
+#
+#     assert client.post('/parscit/parse', json=data).status_code == 400
+
+@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
+def test_parse_batch(client):
+    data = {
+        'strings': ['Animesh Prasad, Manpreet Kaur and Min-Yen Kan (2018) Neural ParsCit: a deep learning-based reference string parser. International Journal on Digital Libraries. May 2018.',
+                    'Juyoung An, Namhee Kim, Min-Yen Kan, Muthu Kumar Chandrasekaran and Min Song (2017) Exploring characteristics of highly cited authors according to citation location and content. Journal of the Association for Information Science and Technology. Volume 68, Issue 8 (August). pp. 1975-1988.']
+    }
+
+    assert client.post('/parscit/parse/batch', json=data).status_code == 200
+
+@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
+def test_parse_batch_no_content(client):
+    data = {
+        'strings': []
+    }
+
+    assert client.post('/parscit/parse/batch', json=data).status_code == 400
diff --git a/tests/test_swagger.py b/tests/test_swagger.py
new file mode 100644
index 00000000..26a0f54e
--- /dev/null
+++ b/tests/test_swagger.py
@@ -0,0 +1,10 @@
+import os
+import pytest
+
+@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
+def test_swagger(client):
+    assert client.get('/swagger.json').status_code == 200
+
+@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
+def test_api_documentation(client):
+    assert client.get('/docs').status_code == 301

From e71c058d140f594dcd076e2c5eb9c480376ee997 Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sun, 16 Sep 2018 14:03:36 +0800
Subject: [PATCH 08/10] Decrease model's deviation threshold

---
 tests/models/test_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_inference.py b/tests/models/test_inference.py
index 4f715bf9..b280aeff 100644
--- a/tests/models/test_inference.py
+++ b/tests/models/test_inference.py
@@ -79,5 +79,5 @@ def test_inference_performance():
     }
     data_file.close()
-
-    assert eval_metrics == pytest.approx({'macro_f1': 0.98, 'micro_f1': 0.99}, abs=0.01)
+
+    assert eval_metrics == pytest.approx({'macro_f1': 0.984, 'micro_f1': 0.993}, abs=0.001)

From b446a47c2761040615cf3a7ae839a8f90405c812 Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sun, 16 Sep 2018 15:14:18 +0800
Subject: [PATCH 09/10] Handle empty string

---
 Dockerfile               | 2 --
 README.md                | 2 +-
 app/__init__.py          | 4 +---
 app/resources/parscit.py | 8 +++++---
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0a8353a3..f0aa096f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,8 +2,6 @@ FROM python:2

 ENV ENVIRONMENT prod
 ENV NUM_WORKERS 1
-ENV WORD_EMB_PATH
-ENV MODEL_PATH

 WORKDIR /usr/src
diff --git a/README.md b/README.md
index e951d40f..6f51a0fd 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@
 ### Using Docker

 1. Build the image: `docker build -t theano-gensim - < Dockerfile`
-1. Run the repo mounted to the container: `docker run -it -v /path/to/Neural-ParsCit:/usr/src --name np theano-gensim:latest /bin/bash`
+1. Run the repo mounted to the container: `docker run -it -v $(pwd):/usr/src --name np theano-gensim:latest /bin/bash`

 ## Word Embeddings
diff --git a/app/__init__.py b/app/__init__.py
index 7fb82d03..ee4f0f3f 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -60,9 +60,7 @@ def not_found(error):
         """
         Handles URLs that are not specified
         """
         return jsonify({
-            'error': {
-                'message': error.message
-            }
+            'message': "API doesn't exist"
         }), 404

     return app
diff --git a/app/resources/parscit.py b/app/resources/parscit.py
index 8370fab2..8d0c9c6a 100644
--- a/app/resources/parscit.py
+++ b/app/resources/parscit.py
@@ -1,15 +1,13 @@
 from __future__ import print_function
 import numpy as np
+from flask import abort, current_app, g
 from flask_restful import reqparse
 from flask_restful_swagger_2 import swagger, Resource
-from flask import current_app, g
 from app.resources.schemas import Entity, ParseResponse, ParseBatchResponse
 from app.utils import get_model
 from utils import create_input
 from loader import prepare_dataset
-
-import logging

 class Parse(Resource):
     """
@@ -35,6 +33,10 @@ def post(self):
         """
         args = self.parser.parse_args()
         ref_string = args.get('string')
+        if ref_string is None or ref_string == "":
+            # Hackish way as reqparse can't catch empty string
+            abort(400, description='string is empty or not provided.')
+
         tokens = ref_string.split(" ")

From 51e895790bb4bd70c178961fb0fdacca6dd02f07 Mon Sep 17 00:00:00 2001
From: Yuan Chuan Kee
Date: Sun, 16 Sep 2018 15:19:44 +0800
Subject: [PATCH 10/10] Restore test for empty string

---
 tests/test_api.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test_api.py b/tests/test_api.py
index 7a20c4c8..2927bfbb 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -9,13 +9,13 @@ def test_parse(client):

     assert client.post('/parscit/parse', json=data).status_code == 200

-# @pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
-# def test_parse_no_content(client):
-#     data = {
-#         'string': ""
-#     }
-#
-#     assert client.post('/parscit/parse', json=data).status_code == 400
+@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
+def test_parse_no_content(client):
+    data = {
+        'string': ""
+    }
+
+    assert client.post('/parscit/parse', json=data).status_code == 400

 @pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
 def test_parse_batch(client):
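
A quick usage sketch for the REST API these patches add (illustrative only, not part of any commit above): it assumes the server is up on `localhost:8000` — e.g. via the gunicorn command in the README changes — and that the `requests` package is available, which is not listed in `requirements/prod.txt`. The request and response shapes follow `app/resources/parscit.py` and `app/resources/schemas/__init__.py`; the sample citation strings and the `BASE_URL` name are arbitrary.

```
# Minimal client sketch for the ParsCit REST API introduced above.
# Assumes the app is served at http://localhost:8000 and that
# `requests` is installed (it is not part of the project requirements).
import requests

BASE_URL = 'http://localhost:8000'

# Single string: POST a JSON body with a 'string' key to /parscit/parse.
single = requests.post(BASE_URL + '/parscit/parse',
                       json={'string': 'Min-Yen Kan (2018) Neural ParsCit. IJDL.'})
print(single.status_code)  # 200 on success
# ParseResponse shape: {'reference_string': ...,
#                       'data': [{'term': ..., 'entity': ...}, ...]}
print(single.json())

# Batch: POST a JSON body with a 'strings' list to /parscit/parse/batch.
batch = requests.post(BASE_URL + '/parscit/parse/batch',
                      json={'strings': ['First reference string.',
                                        'Second reference string.']})
print(batch.status_code)  # 200 on success

# An empty 'string' is rejected with HTTP 400 (behaviour added in PATCH 09).
empty = requests.post(BASE_URL + '/parscit/parse', json={'string': ''})
print(empty.status_code)  # 400
```

The same requests can be issued with `curl` or any other HTTP client; the Swagger document served at `/swagger.json` (exercised by `tests/test_swagger.py`) describes both endpoints.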