From 30ebe9ab3cef47710ce43294519d1b0de67b9199 Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Mon, 23 Sep 2024 08:36:16 +0100 Subject: [PATCH 1/6] CU-8695x1dy9: runs model in background process, run process_tasks in separate docker compose service, dialog workflow to inspect running state of bg_process and API to cancel running process and directly go in to annotate a project. --- docker-compose-dev.yml | 20 +++ docker-compose-prod.yml | 14 ++ docker-compose.yml | 21 ++- envs/env | 2 + envs/env-prod | 2 + webapp/Dockerfile | 1 + webapp/api/api/model_cache.py | 14 ++ webapp/api/api/utils.py | 7 +- webapp/api/api/views.py | 61 ++++++-- webapp/api/core/settings.py | 7 +- webapp/api/core/urls.py | 2 + .../src/components/common/DocumentSummary.vue | 16 -- .../src/components/common/ProjectList.vue | 146 ++++++++++++++++-- webapp/frontend/src/views/Home.vue | 2 +- .../frontend/src/views/TrainAnnotations.vue | 58 ++++--- webapp/scripts/run-bg-process.sh | 13 ++ webapp/scripts/run.sh | 2 - 17 files changed, 299 insertions(+), 89 deletions(-) create mode 100755 webapp/scripts/run-bg-process.sh diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index cad8d993..801d3a0c 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -7,6 +7,7 @@ services: context: ./webapp args: SPACY_MODELS: ${SPACY_MODELS:-en_core_web_md} + image: medcattrainer-api restart: always volumes: - ./webapp/api/core:/home/api/core @@ -23,6 +24,25 @@ services: - MCT_VERSION=latest command: /home/scripts/run.sh + # bg process task runner + medcattrainer-bg-process: + image: medcattrainer-api + depends_on: + - medcattrainer + restart: always + volumes: + - ./webapp/api/core:/home/api/core + - ./webapp/api/api:/home/api/api + - ./webapp/scripts/run-bg-process.sh:/home/scripts/run-bg-process.sh + - ./configs:/home/configs + - api-media:/home/api/media + - api-static:/home/api/static + - api-db:/home/api/db + env_file: + - ./envs/env + command: /home/scripts/run-bg-process.sh + + nginx: 
image: nginx restart: always diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml index 535c5cdc..057659f9 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -20,6 +20,20 @@ services: - MCT_VERSION=v2.17.1 command: /home/scripts/run.sh + # bg process task runner + medcattrainer-bg-process: + image: cogstacksystems/medcat-trainer:v2.17.1 + restart: always + volumes: + - ./configs:/home/configs + - api-media:/home/api/media + - api-static:/home/api/static + - api-db:/home/api/db + - api-db-backup:/home/api/db-backup + env_file: + - ./envs/env + command: /home/scripts/run-bg-process.sh + # crontab - for db backup medcattrainer-db-backup: image: cogstacksystems/medcat-trainer:v2.17.1 diff --git a/docker-compose.yml b/docker-compose.yml index 4f226645..7835e7fd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,8 +2,9 @@ # projects are not used. services: + # api server medcattrainer: - image: cogstacksystems/medcat-trainer:v2.17.1 + image: cogstacksystems/medcat-trainer:v2.17.3 restart: always volumes: - ./configs:/home/configs @@ -14,12 +15,26 @@ services: env_file: - ./envs/env environment: - - MCT_VERSION=v2.17.1 + - MCT_VERSION=v2.17.3 command: /home/scripts/run.sh + # bg process task runner + medcattrainer-bg-process: + image: cogstacksystems/medcat-trainer:v2.17.3 + restart: always + volumes: + - ./configs:/home/configs + - api-media:/home/api/media + - api-static:/home/api/static + - api-db:/home/api/db + - api-db-backup:/home/api/db-backup + env_file: + - ./envs/env + command: /home/scripts/run-bg-process.sh + # crontab - for db backup medcattrainer-db-backup: - image: cogstacksystems/medcat-trainer:v2.17.1 + image: cogstacksystems/medcat-trainer:v2.17.3 restart: always volumes: - ./configs:/home/configs diff --git a/envs/env b/envs/env index 3d58bcb5..a5240589 100644 --- a/envs/env +++ b/envs/env @@ -3,6 +3,8 @@ OPENBLAS_NUM_THREADS=1 ### MedCAT cfg ### MEDCAT_CONFIG_FILE=/home/configs/base.txt +# number of MedCAT 
models that can be cached, run in bg processes at any one time +MAX_MEDCAT_MODELS=2 ### Deployment Realm ### ENV=non-prod diff --git a/envs/env-prod b/envs/env-prod index 0728c195..ec360d58 100644 --- a/envs/env-prod +++ b/envs/env-prod @@ -3,6 +3,8 @@ OPENBLAS_NUM_THREADS=1 ### MedCAT cfg ### MEDCAT_CONFIG_FILE=/home/configs/base.txt +# number of MedCAT models that can be cached, run in bg processes at any one time +MAX_MEDCAT_MODELS=2 ENV=prod # SECRET KEY - edit this for prod deployments, diff --git a/webapp/Dockerfile b/webapp/Dockerfile index f0c0ff09..f0e08e1a 100644 --- a/webapp/Dockerfile +++ b/webapp/Dockerfile @@ -37,3 +37,4 @@ RUN for SPACY_MODEL in ${SPACY_MODELS}; do python -m spacy download ${SPACY_MODE WORKDIR /home/api/ RUN chmod a+x /home/scripts/run.sh +RUN chmod a+x /home/scripts/run-bg-process.sh diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index 304ccf7c..68c41add 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -17,10 +17,22 @@ VOCAB_MAP = {} CAT_MAP = {} +_MAX_MODELS_LOADED = os.getenv("MAX_MEDCAT_MODELS", 1) logger = logging.getLogger(__name__) +def _clear_models(cdb_map: Dict[str, CDB]=CDB_MAP, + vocab_map: Dict[str, Vocab]=VOCAB_MAP, + cat_map: Dict[str, CAT]=CAT_MAP): + if len(cat_map) == _MAX_MODELS_LOADED: + (k := next(iter(cat_map)), cat_map.pop(k)) + if len(cdb_map) == _MAX_MODELS_LOADED: + (k := next(iter(cdb_map)), cdb_map.pop(k)) + if len(vocab_map) == _MAX_MODELS_LOADED: + (k := next(iter(vocab_map)), vocab_map.pop(k)) + + def get_medcat_from_cdb_vocab(project, cdb_map: Dict[str, CDB]=CDB_MAP, vocab_map: Dict[str, Vocab]=VOCAB_MAP, @@ -61,6 +73,7 @@ def get_medcat_from_cdb_vocab(project, vocab_map[vocab_id] = vocab cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) cat_map[cat_id] = cat + _clear_models(cat_map=cat_map, cdb_map=cdb_map, vocab_map=vocab_map) return cat @@ -70,6 +83,7 @@ def get_medcat_from_model_pack(project, cat_map: Dict[str, CAT]=CAT_MAP) -> CAT: 
logger.info('Loading model pack from:%s', model_pack_obj.model_pack.path) cat = CAT.load_model_pack(model_pack_obj.model_pack.path) cat_map[cat_id] = cat + _clear_models(cat_map=cat_map) return cat diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 41039dfe..1570438c 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -240,17 +240,16 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int): project = ProjectAnnotateEntities.objects.get(id=project_id) docs = Document.objects.filter(id__in=doc_ids) - logger.info('Loading CAT object in bg process') + logger.info('Loading CAT object in bg process for project: %s', project.id) cat = get_medcat(project=project) # Set CAT filters cat.config.linking['filters']['cuis'] = project.cuis for doc in docs: - logger.info(f'Running MedCAT model over doc: {doc.id}') + logger.info(f'Running MedCAT model for project {project.id}:{project.name} over doc: {doc.id}') spacy_doc = cat(doc.text) anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project) - add_annotations(spacy_doc=spacy_doc, user=user, project=project, @@ -260,6 +259,8 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int): # add doc to prepared_documents project.prepared_documents.add(doc) project.save() + logger.info('Prepared all docs for project: %s, docs processed: %s', + project.id, project.prepared_documents) @receiver(post_save, sender=ProjectAnnotateEntities) diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py index 19102ad5..71ef2dfc 100644 --- a/webapp/api/api/views.py +++ b/webapp/api/api/views.py @@ -4,6 +4,7 @@ from background_task.models import Task, CompletedTask from django.contrib.auth.views import PasswordResetView +from django.core.exceptions import ObjectDoesNotExist from django.http import HttpResponseBadRequest, HttpResponseServerError, HttpResponse from django.shortcuts import render from django.utils import timezone @@ -245,11 +246,6 @@ def 
prepare_documents(request): 'but is still set on the project. To fix remove and reset the ' 'cui filter file' % project.cuis_file}, status=500) - if request.data.get('bg_task'): - # execute model infer in bg - job = prep_docs(p_id, d_ids, user.id) - return Response({'bg_job_id': job.id}) - try: for d_id in d_ids: document = Document.objects.get(id=d_id) @@ -294,24 +290,59 @@ def prepare_documents(request): return Response({'message': 'Documents prepared successfully'}) +@api_view(http_method_names=['POST']) +def prepare_documents_bg(request): + user = request.user + # Get project id + p_id = request.data['project_id'] + project = ProjectAnnotateEntities.objects.get(id=p_id) + docs = Document.objects.filter(dataset=project.dataset) + + # Get docs that have no AnnotatedEntities + d_ids = [d.id for d in docs if len(AnnotatedEntity.objects.filter(document=d).filter(project=project)) == 0 or + d in project.validated_documents.all()] + + # execute model infer in bg + job = prep_docs(p_id, d_ids, user.id) + return Response({'bg_job_id': job.id}) + + @api_view(http_method_names=['GET']) -def prepare_docs_bg_tasks(request): - proj_id = int(request.GET['project']) +def prepare_docs_bg_tasks(_): running_doc_prep_tasks = Task.objects.filter(queue='doc_prep') completed_doc_prep_tasks = CompletedTask.objects.filter(queue='doc_prep') def transform_task_params(task_params_str): task_params = json.loads(task_params_str)[0] return { - 'document': task_params[1][0], + 'project': task_params[0], 'user_id': task_params[2] } - running_tasks = [transform_task_params(task.task_params) for task in running_doc_prep_tasks - if json.loads(task.task_params)[0][0] == proj_id] - complete_tasks = [transform_task_params(task.task_params) for task in completed_doc_prep_tasks - if json.loads(task.task_params)[0][0] == proj_id] + running_tasks = [transform_task_params(task.task_params) for task in running_doc_prep_tasks] + complete_tasks = [transform_task_params(task.task_params) for task in 
completed_doc_prep_tasks] return Response({'running_tasks': running_tasks, 'comp_tasks': complete_tasks}) + +@api_view(http_method_names=['GET', 'DELETE']) +def prepare_docs_bg_task(request, proj_id): + if request.method == 'GET': + # state of bg running process as determined by prepared docs + try: + proj = ProjectAnnotateEntities.objects.get(id=proj_id) + prepd_docs_count = proj.prepared_documents.count() + ds_total_count = Document.objects.filter(dataset=ProjectAnnotateEntities.objects.get(id=proj_id).dataset.id).count() + return Response({'proj_id': proj_id, 'dataset_len': ds_total_count, 'prepd_docs_len': prepd_docs_count}) + except ObjectDoesNotExist: + return HttpResponseBadRequest('No Project found for ID: %s', proj_id) + else: + running_doc_prep_tasks = {json.loads(task.task_params)[0][0]: task.id + for task in Task.objects.filter(queue='doc_prep')} + if proj_id in running_doc_prep_tasks: + Task.objects.filter(id=running_doc_prep_tasks[proj_id]).delete() + return Response("Successfully stopped running response") + else: + return HttpResponseBadRequest('Could not find running BG Process to stop') + @api_view(http_method_names=['POST']) def add_annotation(request): # Get project id @@ -620,7 +651,11 @@ def version(_): def concept_search_index_available(request): cdb_ids = request.GET.get('cdbs', '').split(',') cdb_ids = [c for c in cdb_ids if len(c)] - return collections_available(cdb_ids) + try: + return collections_available(cdb_ids) + except Exception as e: + return HttpResponseServerError("Solr Search Service not available check the service is up, running " + "and configured correctly. %s", e) @api_view(http_method_names=['GET']) diff --git a/webapp/api/core/settings.py b/webapp/api/core/settings.py index 403d6661..a86de0e7 100644 --- a/webapp/api/core/settings.py +++ b/webapp/api/core/settings.py @@ -20,9 +20,10 @@ # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -environ_origins = os.environ.get('CSRF_TRUSTED_ORIGINS', None) -trusted_origins = [] if environ_origins is None else environ_origins.split(',') -CSRF_TRUSTED_ORIGINS = ['https://127.0.0.1:8001', 'http://localhost:8001'] + trusted_origins +environ_origins = os.environ.get('CSRF_TRUSTED_ORIGINS', '') +trusted_origins = [origin.strip() for origin in environ_origins.split(',') if origin.strip()] + +CSRF_TRUSTED_ORIGINS = ['http://127.0.0.1:8001', 'http://localhost:8001'] + trusted_origins SECURE_CROSS_ORIGIN_OPENER_POLICY = None diff --git a/webapp/api/core/urls.py b/webapp/api/core/urls.py index 4b12c3e6..fa351914 100644 --- a/webapp/api/core/urls.py +++ b/webapp/api/core/urls.py @@ -27,7 +27,9 @@ path('api/anno-conf/', api.views.get_anno_tool_conf), path('api/search-concepts/', api.views.search_solr), path('api/prepare-documents/', api.views.prepare_documents), + path('api/prepare-documents-bg/', api.views.prepare_documents_bg), path('api/prep-docs-bg-tasks/', api.views.prepare_docs_bg_tasks), + path('api/prep-docs-bg-tasks//', api.views.prepare_docs_bg_task), path('api/api-token-auth/', auth_views.obtain_auth_token), path('admin/', admin.site.urls), path('api/api-auth/', include('rest_framework.urls', namespace='rest_framework')), diff --git a/webapp/frontend/src/components/common/DocumentSummary.vue b/webapp/frontend/src/components/common/DocumentSummary.vue index 7a6897b8..4f22087c 100644 --- a/webapp/frontend/src/components/common/DocumentSummary.vue +++ b/webapp/frontend/src/components/common/DocumentSummary.vue @@ -67,23 +67,7 @@ export default { runningBgTasks: [] } }, - created() { - // this.pollDocPrepStatus(true) - }, methods: { - pollDocPrepStatus (pollInfinite) { - if (this.projId) { - this.$http.get(`/api/prep-docs-bg-tasks/?project=${this.projId}`).then(resp => { - this.runningBgTasks = resp.data.running_tasks.map(d => d.document) - this.completeBgTasks = resp.data.comp_tasks.map(d 
=> d.document) - }) - if (pollInfinite) { - setTimeout(this.pollDocPrepStatus, 5000) - } - } else { - setTimeout(this.pollDocPrepStatus, 5000) - } - }, scrollSelectedDocId () { const el = document.getElementsByClassName('selected-doc') if (el.length > 0) { diff --git a/webapp/frontend/src/components/common/ProjectList.vue b/webapp/frontend/src/components/common/ProjectList.vue index 8d2d0ae4..44f85987 100644 --- a/webapp/frontend/src/components/common/ProjectList.vue +++ b/webapp/frontend/src/components/common/ProjectList.vue @@ -114,11 +114,20 @@ + @@ -134,6 +143,7 @@ + @@ -164,7 +174,35 @@
- +
+ + + +
+

Background Model Predictions

+
+
+ + + {{ cancelRunningBgTaskModal.prepCount }} / {{ cancelRunningBgTaskModal.dsCount }} + + +
+ Confirm to stop running model predictions in the background and enter project. +
+
+ Model predictions ready. +
+
+
+
@@ -198,6 +236,7 @@ export default { { key: 'progress', label: 'Progress', formatter: this.progressFormatter }, { key: 'anno_class', label: 'Annotation Classification', sortable: true }, { key: 'cdb_search_filter', label: 'Concepts Imported' }, + { key: 'run_model', label: 'Run Model' }, { key: 'model_loaded', label: 'Model Loaded' }, { key: 'metrics', label: 'Metrics' }, { key: 'save_model', label: 'Save Model' } @@ -205,6 +244,7 @@ export default { adminOnlyFields: [ 'anno_class', 'cdb_search_filter', + 'run_model', 'model_loaded', 'save_model' ] @@ -212,15 +252,22 @@ export default { projectLockedWarning: false, modelSaved: false, modelSavedError: false, + runModelBgError: false, loadingModel: false, modelCacheLoadError: false, metricsJobId: null, saving: false, clearModelModal: false, selectedProjects: [], - loadingProjects: false + loadingProjects: false, + runningBgTasks: new Set(), + completeBgTasks: new Set(), + cancelRunningBgTaskModal: null } }, + created () { + this.pollDocPrepStatus() + }, methods: { clearLoadedModel (cdbId) { this.clearModelModal = cdbId @@ -264,23 +311,41 @@ export default { }, 15000) }) }, - select (projects) { + // somehow projects is empty... 
let project = projects[0] - if (!project.project_locked) { - this.$router.push({ - name: 'train-annotations', - params: { - projectId: project.id - } - }) - } else { - this.projectLockedWarning = true + if (project) { + if (project.project_locked) { + this.projectLockedWarning = true + const that = this + setTimeout(() => { + that.projectLockedWarning = false + }, 5000) + } else if (this.runningBgTasks.has(project.id)) { + this.bgTaskStatus(project) + } else { + this.$router.push({ + name: 'train-annotations', + params: { + projectId: project.id + } + }) + } + } + }, + runModel (projectId) { + let payload = { + project_id: projectId + } + this.runningBgTasks = new Set([...this.runningBgTasks, projectId]) + this.$http.post('/api/prepare-documents-bg/', payload).then(_ => { + }).catch(_ => { + this.runModelBgError = true const that = this - setTimeout(() => { - that.projectLockedWarning = false + setTimeout(function () { + that.runModelBgError = false }, 5000) - } + }) }, saveModel (projectId) { let payload = { @@ -303,6 +368,40 @@ export default { }, 5000) }) }, + bgTaskStatus (project) { + this.$http.get(`/api/prep-docs-bg-tasks/${project.id}/`).then(resp => { + this.cancelRunningBgTaskModal = { + proj: project, + dsCount: resp.data.dataset_len, + prepCount: resp.data.prepd_docs_len + } + setTimeout(() => { + if (this.cancelRunningBgTaskModal) { + this.bgTaskStatus(project) + } + }, 5000) + }) + }, + confirmCancelBgTaskStop () { + let project = this.cancelRunningBgTaskModal.proj + this.$http.delete(`/api/prep-docs-bg-tasks/${project.id}/`).then(_ => { + this.runningBgTasks.delete(project.id) + }).catch(exc => { + console.warn(exc) + }).finally(_ => { + this.select([project]) + this.cancelRunningBgTaskModal = null + }) + }, + pollDocPrepStatus () { + this.$http.get('/api/prep-docs-bg-tasks/').then(resp => { + this.completeBgTasks = new Set(resp.data.comp_tasks.map(d => d.project)) + this.runningBgTasks = new Set([...this.runningBgTasks, + 
...resp.data.running_tasks.map(d => d.project)]).difference(this.completeBgTasks) + + }) + setTimeout(this.pollDocPrepStatus, 8000) + }, progressFormatter (value, key, item) { let txtColorClass = 'good-perf' if (item['percent_progress'] < 80) { @@ -425,5 +524,20 @@ export default { text-overflow: ellipsis; } +.run-model { + position: relative; +} + +.model-bg-run-comp { + color: $success; + font-size: 15px; + position: absolute; + right: -5px; + top: -5px; +} + +.cancel-dialog-body { + padding-top: 10px; +} diff --git a/webapp/frontend/src/views/Home.vue b/webapp/frontend/src/views/Home.vue index 5aef66bf..2bf0374f 100644 --- a/webapp/frontend/src/views/Home.vue +++ b/webapp/frontend/src/views/Home.vue @@ -175,7 +175,7 @@ export default { this.selectedProjectGroup = projectGroups[0] this.selectedProjectGroup.items = this.projects.items.filter(p => p.group === this.selectedProjectGroup.id) } - }, + } } } diff --git a/webapp/frontend/src/views/TrainAnnotations.vue b/webapp/frontend/src/views/TrainAnnotations.vue index 4a089e82..dbbba9a2 100644 --- a/webapp/frontend/src/views/TrainAnnotations.vue +++ b/webapp/frontend/src/views/TrainAnnotations.vue @@ -443,42 +443,36 @@ export default { this.currentEnt = null this.prepareDoc() }, - prepareDocBg (doc) { - let payload = { - project_id: this.project.id, - document_ids: [doc.id], - bg_task: true - } - this.$http.post('/api/prepare-documents/', payload) - }, prepareDoc () { - this.loadingMsg = "Loading MedCAT model..." - this.$http.get(`/api/cache-model/${this.project.concept_db}/`).then(_ => { - this.loadingMsg = "Preparing Document..." - let payload = { - project_id: this.project.id, - document_ids: [this.currentDoc.id] - } - this.$http.post('/api/prepare-documents/', payload).then(_ => { - // assuming a 200 is fine here. 
- if (!this.project.prepared_documents.includes(this.currentDoc.id)) { - this.project.prepared_documents.push(this.currentDoc.id) + if (this.project.prepared_documents.includes(this.currentDoc.id)) { + this.fetchEntities() + } else { + this.loadingMsg = "Loading MedCAT model..." + this.$http.get(`/api/cache-model/${this.project.concept_db}/`).then(_ => { + this.loadingMsg = "Preparing Document..." + let payload = { + project_id: this.project.id, + document_ids: [this.currentDoc.id] } - - this.fetchEntities() - }).catch(err => { + this.$http.post('/api/prepare-documents/', payload).then(_ => { + // assuming a 200 is fine here. + if (!this.project.prepared_documents.includes(this.currentDoc.id)) { + this.project.prepared_documents.push(this.currentDoc.id) + } + this.fetchEntities() + }).catch(err => { + this.errors.modal = true + if (err.response) { + this.errors.message = err.response.data.message || 'Internal Server Error.' + this.errors.description = err.response.data.description || '' + this.errors.stacktrace = err.response.data.stacktrace + } + }) + }).catch(_ => { this.errors.modal = true - if (err.response) { - this.errors.message = err.response.data.message || 'Internal Server Error.' - this.errors.description = err.response.data.description || '' - this.errors.stacktrace = err.response.data.stacktrace - } + this.errors.mesasge = "Internal server error - cannot load MedCAT model. Contact your MedCAT admin quoting this project ID" }) - }).catch(_ => { - this.errors.modal = true - this.errors.mesasge = "Internal server error - cannot load MedCAT model. Contact your MedCAT admin quoting this project ID" - }) - + } }, fetchEntities (selectedEntId) { let params = this.nextEntSetUrl === null ? 
`?project=${this.projectId}&document=${this.currentDoc.id}` diff --git a/webapp/scripts/run-bg-process.sh b/webapp/scripts/run-bg-process.sh new file mode 100755 index 00000000..b54d535f --- /dev/null +++ b/webapp/scripts/run-bg-process.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +# env vars that should only be on for app running... +export RESUBMIT_ALL_ON_STARTUP=0 + +# Collect static files and migrate if needed +python /home/api/manage.py collectstatic --noinput +python /home/api/manage.py makemigrations --noinput +python /home/api/manage.py makemigrations api --noinput +python /home/api/manage.py migrate --noinput +python /home/api/manage.py migrate api --noinput + +python /home/api/manage.py process_tasks --log-std diff --git a/webapp/scripts/run.sh b/webapp/scripts/run.sh index 3f2285e9..e06ca268 100755 --- a/webapp/scripts/run.sh +++ b/webapp/scripts/run.sh @@ -14,8 +14,6 @@ python /home/api/manage.py makemigrations api --noinput python /home/api/manage.py migrate --noinput python /home/api/manage.py migrate api --noinput -python /home/api/manage.py process_tasks --log-std & - # create a new super user, with username and password 'admin' # also create a user group `user_group` that prevents users from deleting models echo "from django.contrib.auth import get_user_model From 5d6a3fd56dfd1fe1c27c5058d9a81762dec35eb1 Mon Sep 17 00:00:00 2001 From: Tom Searle Date: Mon, 23 Sep 2024 08:36:16 +0100 Subject: [PATCH 2/6] CU-8695x1dy9: runs model in background process, run process_tasks in separate docker compose service, dialog workflow to inspect running state of bg_process and API to cancel running process and directly go in to annotate a project. 
--- docker-compose-dev.yml | 20 +++ docker-compose-prod.yml | 14 ++ docker-compose.yml | 15 ++ envs/env | 2 + envs/env-prod | 2 + webapp/Dockerfile | 1 + webapp/api/api/model_cache.py | 14 ++ webapp/api/api/utils.py | 7 +- webapp/api/api/views.py | 61 ++++++-- webapp/api/core/settings.py | 7 +- webapp/api/core/urls.py | 2 + .../src/components/common/DocumentSummary.vue | 16 -- .../src/components/common/ProjectList.vue | 144 ++++++++++++++++-- webapp/frontend/src/views/Home.vue | 2 +- .../frontend/src/views/TrainAnnotations.vue | 60 ++++---- webapp/scripts/run-bg-process.sh | 13 ++ webapp/scripts/run.sh | 2 - 17 files changed, 297 insertions(+), 85 deletions(-) create mode 100755 webapp/scripts/run-bg-process.sh diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index d03a7f7d..bf22d0a0 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -7,6 +7,7 @@ services: context: ./webapp args: SPACY_MODELS: ${SPACY_MODELS:-en_core_web_md} + image: medcattrainer-api restart: always volumes: - ./webapp/api/core:/home/api/core @@ -23,6 +24,25 @@ services: - MCT_VERSION=latest command: /home/scripts/run.sh + # bg process task runner + medcattrainer-bg-process: + image: medcattrainer-api + depends_on: + - medcattrainer + restart: always + volumes: + - ./webapp/api/core:/home/api/core + - ./webapp/api/api:/home/api/api + - ./webapp/scripts/run-bg-process.sh:/home/scripts/run-bg-process.sh + - ./configs:/home/configs + - api-media:/home/api/media + - api-static:/home/api/static + - api-db:/home/api/db + env_file: + - ./envs/env + command: /home/scripts/run-bg-process.sh + + nginx: image: nginx restart: always diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml index 83247847..4d4d8c8a 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -20,6 +20,20 @@ services: - MCT_VERSION=v2.17.4 command: /home/scripts/run.sh + # bg process task runner + medcattrainer-bg-process: + image: cogstacksystems/medcat-trainer:v2.17.1 + restart: 
always + volumes: + - ./configs:/home/configs + - api-media:/home/api/media + - api-static:/home/api/static + - api-db:/home/api/db + - api-db-backup:/home/api/db-backup + env_file: + - ./envs/env + command: /home/scripts/run-bg-process.sh + # crontab - for db backup medcattrainer-db-backup: image: cogstacksystems/medcat-trainer:v2.17.4 diff --git a/docker-compose.yml b/docker-compose.yml index e6d624b9..b488a7d6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,6 +2,7 @@ # projects are not used. services: + # api server medcattrainer: image: cogstacksystems/medcat-trainer:v2.17.4 restart: always @@ -17,6 +18,20 @@ services: - MCT_VERSION=v2.17.4 command: /home/scripts/run.sh + # bg process task runner + medcattrainer-bg-process: + image: cogstacksystems/medcat-trainer:v2.17.4 + restart: always + volumes: + - ./configs:/home/configs + - api-media:/home/api/media + - api-static:/home/api/static + - api-db:/home/api/db + - api-db-backup:/home/api/db-backup + env_file: + - ./envs/env + command: /home/scripts/run-bg-process.sh + # crontab - for db backup medcattrainer-db-backup: image: cogstacksystems/medcat-trainer:v2.17.4 diff --git a/envs/env b/envs/env index 30743e74..b083d1e2 100644 --- a/envs/env +++ b/envs/env @@ -3,6 +3,8 @@ OPENBLAS_NUM_THREADS=1 ### MedCAT cfg ### MEDCAT_CONFIG_FILE=/home/configs/base.txt +# number of MedCAT models that can be cached, run in bg processes at any one time +MAX_MEDCAT_MODELS=2 ### Deployment Realm ### ENV=non-prod diff --git a/envs/env-prod b/envs/env-prod index 140c94b1..7ba55476 100644 --- a/envs/env-prod +++ b/envs/env-prod @@ -3,6 +3,8 @@ OPENBLAS_NUM_THREADS=1 ### MedCAT cfg ### MEDCAT_CONFIG_FILE=/home/configs/base.txt +# number of MedCAT models that can be cached, run in bg processes at any one time +MAX_MEDCAT_MODELS=2 ENV=prod # SECRET KEY - edit this for prod deployments, diff --git a/webapp/Dockerfile b/webapp/Dockerfile index f0c0ff09..f0e08e1a 100644 --- a/webapp/Dockerfile +++ b/webapp/Dockerfile @@ 
-37,3 +37,4 @@ RUN for SPACY_MODEL in ${SPACY_MODELS}; do python -m spacy download ${SPACY_MODE WORKDIR /home/api/ RUN chmod a+x /home/scripts/run.sh +RUN chmod a+x /home/scripts/run-bg-process.sh diff --git a/webapp/api/api/model_cache.py b/webapp/api/api/model_cache.py index 304ccf7c..68c41add 100644 --- a/webapp/api/api/model_cache.py +++ b/webapp/api/api/model_cache.py @@ -17,10 +17,22 @@ VOCAB_MAP = {} CAT_MAP = {} +_MAX_MODELS_LOADED = os.getenv("MAX_MEDCAT_MODELS", 1) logger = logging.getLogger(__name__) +def _clear_models(cdb_map: Dict[str, CDB]=CDB_MAP, + vocab_map: Dict[str, Vocab]=VOCAB_MAP, + cat_map: Dict[str, CAT]=CAT_MAP): + if len(cat_map) == _MAX_MODELS_LOADED: + (k := next(iter(cat_map)), cat_map.pop(k)) + if len(cdb_map) == _MAX_MODELS_LOADED: + (k := next(iter(cdb_map)), cdb_map.pop(k)) + if len(vocab_map) == _MAX_MODELS_LOADED: + (k := next(iter(vocab_map)), vocab_map.pop(k)) + + def get_medcat_from_cdb_vocab(project, cdb_map: Dict[str, CDB]=CDB_MAP, vocab_map: Dict[str, Vocab]=VOCAB_MAP, @@ -61,6 +73,7 @@ def get_medcat_from_cdb_vocab(project, vocab_map[vocab_id] = vocab cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab) cat_map[cat_id] = cat + _clear_models(cat_map=cat_map, cdb_map=cdb_map, vocab_map=vocab_map) return cat @@ -70,6 +83,7 @@ def get_medcat_from_model_pack(project, cat_map: Dict[str, CAT]=CAT_MAP) -> CAT: logger.info('Loading model pack from:%s', model_pack_obj.model_pack.path) cat = CAT.load_model_pack(model_pack_obj.model_pack.path) cat_map[cat_id] = cat + _clear_models(cat_map=cat_map) return cat diff --git a/webapp/api/api/utils.py b/webapp/api/api/utils.py index 03354f26..f434bb1a 100644 --- a/webapp/api/api/utils.py +++ b/webapp/api/api/utils.py @@ -241,17 +241,16 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int): project = ProjectAnnotateEntities.objects.get(id=project_id) docs = Document.objects.filter(id__in=doc_ids) - logger.info('Loading CAT object in bg process') + logger.info('Loading CAT 
object in bg process for project: %s', project.id) cat = get_medcat(project=project) # Set CAT filters cat.config.linking['filters']['cuis'] = project.cuis for doc in docs: - logger.info(f'Running MedCAT model over doc: {doc.id}') + logger.info(f'Running MedCAT model for project {project.id}:{project.name} over doc: {doc.id}') spacy_doc = cat(doc.text) anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project) - add_annotations(spacy_doc=spacy_doc, user=user, project=project, @@ -261,6 +260,8 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int): # add doc to prepared_documents project.prepared_documents.add(doc) project.save() + logger.info('Prepared all docs for project: %s, docs processed: %s', + project.id, project.prepared_documents) @receiver(post_save, sender=ProjectAnnotateEntities) diff --git a/webapp/api/api/views.py b/webapp/api/api/views.py index 027daa06..6a415125 100644 --- a/webapp/api/api/views.py +++ b/webapp/api/api/views.py @@ -4,6 +4,7 @@ from background_task.models import Task, CompletedTask from django.contrib.auth.views import PasswordResetView +from django.core.exceptions import ObjectDoesNotExist from django.http import HttpResponseBadRequest, HttpResponseServerError, HttpResponse from django.shortcuts import render from django.utils import timezone @@ -248,11 +249,6 @@ def prepare_documents(request): 'but is still set on the project. 
To fix remove and reset the ' 'cui filter file' % project.cuis_file}, status=500)
 
-    if request.data.get('bg_task'):
-        # execute model infer in bg
-        job = prep_docs(p_id, d_ids, user.id)
-        return Response({'bg_job_id': job.id})
-
     try:
         for d_id in d_ids:
             document = Document.objects.get(id=d_id)
@@ -297,24 +293,76 @@ def prepare_documents(request):
     return Response({'message': 'Documents prepared successfully'})
 
 
+@api_view(http_method_names=['POST'])
+def prepare_documents_bg(request):
+    """Queue a background task that prepares a project's documents via ``prep_docs``.
+
+    Expects ``project_id`` in the request data and responds with the id of the
+    queued task, or with HTTP 400 if the project id is missing or unknown.
+    """
+    user = request.user
+    # Get project id
+    p_id = request.data.get('project_id')
+    if p_id is None:
+        return HttpResponseBadRequest('No project_id provided')
+    try:
+        project = ProjectAnnotateEntities.objects.get(id=p_id)
+    except (ObjectDoesNotExist, ValueError):
+        return HttpResponseBadRequest('No Project found for ID: %s' % p_id)
+
+    # Get docs that have no AnnotatedEntities for this project, or that are already validated.
+    # Both id sets are fetched once rather than re-querying for every document.
+    annotated_ids = set(AnnotatedEntity.objects.filter(project=project)
+                        .values_list('document_id', flat=True))
+    validated_ids = set(project.validated_documents.values_list('id', flat=True))
+    doc_ids = Document.objects.filter(dataset=project.dataset).values_list('id', flat=True)
+    d_ids = [d_id for d_id in doc_ids if d_id not in annotated_ids or d_id in validated_ids]
+
+    # execute model infer in bg
+    job = prep_docs(p_id, d_ids, user.id)
+    return Response({'bg_job_id': job.id})
+
+
 @api_view(http_method_names=['GET'])
-def prepare_docs_bg_tasks(request):
-    proj_id = int(request.GET['project'])
+def prepare_docs_bg_tasks(_):
+    """List the project and user of every running and completed 'doc_prep' background task."""
     running_doc_prep_tasks = Task.objects.filter(queue='doc_prep')
     completed_doc_prep_tasks = CompletedTask.objects.filter(queue='doc_prep')
 
     def transform_task_params(task_params_str):
         task_params = json.loads(task_params_str)[0]
         return {
-            'document': task_params[1][0],
+            'project': task_params[0],
             'user_id': task_params[2]
         }
-    running_tasks = [transform_task_params(task.task_params) for task in running_doc_prep_tasks
-                     if json.loads(task.task_params)[0][0] == proj_id]
-    complete_tasks = [transform_task_params(task.task_params) for task in completed_doc_prep_tasks
-                      if json.loads(task.task_params)[0][0] == proj_id]
+    running_tasks = [transform_task_params(task.task_params) for task in running_doc_prep_tasks]
+    complete_tasks = [transform_task_params(task.task_params) for task in completed_doc_prep_tasks]
     return Response({'running_tasks': running_tasks, 'comp_tasks': complete_tasks})
 
+
+@api_view(http_method_names=['GET', 'DELETE'])
+def prepare_docs_bg_task(request, proj_id):
+    """Report the progress of (GET) or cancel (DELETE) a project's background doc prep task."""
+    if request.method == 'GET':
+        # state of bg running process as determined by prepared docs
+        try:
+            proj = ProjectAnnotateEntities.objects.get(id=proj_id)
+        except ObjectDoesNotExist:
+            # interpolate the id into the message: extra positional args of a Django
+            # response are not format arguments
+            return HttpResponseBadRequest('No Project found for ID: %s' % proj_id)
+        prepd_docs_count = proj.prepared_documents.count()
+        ds_total_count = Document.objects.filter(dataset=proj.dataset).count()
+        return Response({'proj_id': proj_id, 'dataset_len': ds_total_count, 'prepd_docs_len': prepd_docs_count})
+    else:
+        running_doc_prep_tasks = {json.loads(task.task_params)[0][0]: task.id
+                                  for task in Task.objects.filter(queue='doc_prep')}
+        if proj_id in running_doc_prep_tasks:
+            Task.objects.filter(id=running_doc_prep_tasks[proj_id]).delete()
+            return Response("Successfully stopped running response")
+        else:
+            return HttpResponseBadRequest('Could not find running BG Process to stop')
+
 @api_view(http_method_names=['POST'])
 def add_annotation(request):
     # Get project id
@@ -623,7 +671,12 @@ def version(_):
 def concept_search_index_available(request):
     cdb_ids = request.GET.get('cdbs', '').split(',')
     cdb_ids = [c for c in cdb_ids if len(c)]
-    return collections_available(cdb_ids)
+    try:
+        return collections_available(cdb_ids)
+    except Exception as e:
+        # format the error into the body; a second positional arg would be taken as content_type
+        return HttpResponseServerError("Solr Search Service not available check the service is up, running "
+                                       "and configured correctly. %s" % e)
 
 
 @api_view(http_method_names=['GET'])
diff --git a/webapp/api/core/settings.py b/webapp/api/core/settings.py
index 0170c729..e81841c6 100644
--- a/webapp/api/core/settings.py
+++ b/webapp/api/core/settings.py
@@ -20,9 +20,10 @@
 # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 
-environ_origins = os.environ.get('CSRF_TRUSTED_ORIGINS', None)
-trusted_origins = [] if environ_origins is None else environ_origins.split(',')
-CSRF_TRUSTED_ORIGINS = ['https://127.0.0.1:8001', 'http://localhost:8001'] + trusted_origins
+environ_origins = os.environ.get('CSRF_TRUSTED_ORIGINS', '')
+trusted_origins = [origin.strip() for origin in environ_origins.split(',') if origin.strip()]
+
+CSRF_TRUSTED_ORIGINS = ['http://127.0.0.1:8001', 'http://localhost:8001'] + trusted_origins
 
 SECURE_CROSS_ORIGIN_OPENER_POLICY = None
 
diff --git a/webapp/api/core/urls.py b/webapp/api/core/urls.py
index 4b12c3e6..fa351914 100644
--- a/webapp/api/core/urls.py
+++ b/webapp/api/core/urls.py
@@ -27,7 +27,9 @@
     path('api/anno-conf/', api.views.get_anno_tool_conf),
     path('api/search-concepts/', api.views.search_solr),
     path('api/prepare-documents/', api.views.prepare_documents),
+    path('api/prepare-documents-bg/', api.views.prepare_documents_bg),
     path('api/prep-docs-bg-tasks/', api.views.prepare_docs_bg_tasks),
+    path('api/prep-docs-bg-tasks/<int:proj_id>/', api.views.prepare_docs_bg_task),
     path('api/api-token-auth/', auth_views.obtain_auth_token),
     path('admin/', admin.site.urls),
     path('api/api-auth/', include('rest_framework.urls', namespace='rest_framework')),
diff --git a/webapp/frontend/src/components/common/DocumentSummary.vue b/webapp/frontend/src/components/common/DocumentSummary.vue
index 5c17b69e..3f1e97a2 100644
--- a/webapp/frontend/src/components/common/DocumentSummary.vue
+++ b/webapp/frontend/src/components/common/DocumentSummary.vue
@@ -68,23 +68,7 @@ export default {
       runningBgTasks: []
     }
   },
-  created() {
-    // this.pollDocPrepStatus(true)
-  },
   methods: {
-    pollDocPrepStatus (pollInfinite) {
-      if (this.projId) {
-        this.$http.get(`/api/prep-docs-bg-tasks/?project=${this.projId}`).then(resp => {
-          this.runningBgTasks = resp.data.running_tasks.map(d => d.document)
-          this.completeBgTasks = resp.data.comp_tasks.map(d
=> d.document) - }) - if (pollInfinite) { - setTimeout(this.pollDocPrepStatus, 5000) - } - } else { - setTimeout(this.pollDocPrepStatus, 5000) - } - }, scrollSelectedDocId () { const el = document.getElementsByClassName('selected-doc') if (el.length > 0) { diff --git a/webapp/frontend/src/components/common/ProjectList.vue b/webapp/frontend/src/components/common/ProjectList.vue index c45b5352..b5c6eabc 100644 --- a/webapp/frontend/src/components/common/ProjectList.vue +++ b/webapp/frontend/src/components/common/ProjectList.vue @@ -1,5 +1,5 @@