Bump @adobe/css-tools from 4.3.1 to 4.3.2 in /src/front #3

Open · wants to merge 56 commits into base: master

Commits (56):
1b8ddc7
Update README.md
vvsotnikov Nov 3, 2022
f3c310c
Create LICENSE
vvsotnikov Nov 3, 2022
e12d7c0
Add simple starting page
vvsotnikov Nov 4, 2022
6baea33
Update health check path
vvsotnikov Nov 4, 2022
931df0b
Fix mounting logic
vvsotnikov Nov 4, 2022
5f945f2
Add searching using filters
Jun 12, 2023
d3592a4
Fix dataset name
Jun 13, 2023
f3d4dc3
add date information
Jul 4, 2023
a7e1dcb
Remove empty entities
Jul 13, 2023
a042ccc
Search incomplete object names
Jul 13, 2023
2e1a71e
Remove non-specified types and fix names
Jul 17, 2023
6f2dc1f
Add messenger type (ATeL)
Jul 17, 2023
e73ea44
Add coordinates data
Aug 10, 2023
291a6a5
Add crawlers
Aug 22, 2023
1e647b1
Add scripts
Aug 29, 2023
dd236fc
Add ffn weights
Aug 30, 2023
b848bd9
Add classifying entities to groups
Aug 31, 2023
eca7ee0
Add fine-tuned gpt2-exomachina weights
Aug 31, 2023
4f736d2
Update README
Aug 31, 2023
6c812d3
Fix API examples
Aug 31, 2023
21270b2
Add coordinates description
Aug 31, 2023
048469e
Add information about directories
Aug 31, 2023
9204fa9
Update README.md
vvsotnikov Sep 1, 2023
3076caf
Update README.md
vvsotnikov Sep 1, 2023
7043eb1
after hard reset: backup
Sep 17, 2023
f373591
Initial front works, trying to fix cors error
Sep 17, 2023
ff6f424
Added second docker image for front
Sep 17, 2023
b5a8ea2
merge API and frontend into a single Docker image
vvsotnikov Sep 26, 2023
b600c18
set wildcard for CORS origins
vvsotnikov Sep 26, 2023
7c17146
fix search
vvsotnikov Sep 26, 2023
0e86b72
apiCalls can now work with the absence of some parameters
Oct 26, 2023
c943972
coords and radius are added to the URL correctly now
Oct 27, 2023
2b134bc
Remove node_modules/.cache from git tracking
Oct 27, 2023
64dc5a8
Updated .gitignore
Oct 27, 2023
f0390f7
refactor search API, split "object_name" and "coordinates" into two s…
vvsotnikov Oct 27, 2023
8f5c6c3
apiCalls can now work in the absence of some parameters
Oct 26, 2023
038ab8b
default radius value is tracked during back navigation from the star map page
Oct 27, 2023
f50836a
create an empty list instead of missing metadata fields
vvsotnikov Oct 27, 2023
146cca2
Remove cache from version control
Oct 27, 2023
a7bd991
Merge remote-tracking branch 'refs/remotes/origin/master'
Oct 27, 2023
b591b21
create an empty list instead of missing metadata fields [fixed]
vvsotnikov Oct 27, 2023
b98017b
update readme
vvsotnikov Oct 27, 2023
9c0d6d6
before creating server proxy for atel gcn
Oct 28, 2023
cf404c8
Merge branch 'master' of https://github.com/JetBrains/lm-astronomy
Oct 28, 2023
ada3609
add GCN and ATel readers to the API
vvsotnikov Oct 30, 2023
68c977e
optimize coordinates search
vvsotnikov Oct 30, 2023
922cc25
initial rituals for message page
Oct 30, 2023
7abc7ec
two arrays: atel gcn merged by mixing
Nov 1, 2023
cea6064
Merge branch 'master' of https://github.com/JetBrains/lm-astronomy
Nov 1, 2023
e044580
Merge branch 'proxy'
Nov 1, 2023
855d7a9
Update fetch functions and message ID presentation.
Nov 2, 2023
52d813f
Update MessagePage layout and user experience
Nov 2, 2023
7e76d26
Refine parameters and debug logging in apiServices.js
Nov 3, 2023
17571ef
Refactor method 'arrayMixer' in apiServices.js
Nov 3, 2023
5932e41
typo fix
vvsotnikov Nov 7, 2023
372d74b
Bump @adobe/css-tools from 4.3.1 to 4.3.2 in /src/front
dependabot[bot] Dec 1, 2023
Files changed:
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.pt filter=lfs diff=lfs merge=lfs -text
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
.DS_Store
.idea
/node_modules
materials
src/front/node_modules/
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 JetBrains

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
170 changes: 157 additions & 13 deletions README.md
@@ -1,31 +1,175 @@
# lm-astronomy API

[![research JetBrains project](https://jb.gg/badges/research.svg)](https://confluence.jetbrains.com/display/ALL/JetBrains+on+GitHub)

The `lm-astronomy` API is designed to facilitate searching and cross-referencing of GCN Circulars and Astronomer's Telegram (ATel) messages. The dataset currently includes 15k ATel and 31k GCN messages; if you wish to extend it further, we provide a pipeline for preprocessing additional messages.

## Features

Using the API, you can search and filter messages with the following parameters (a minimal request sketch in Python follows the list):

- `object_type`: This references a predetermined set of object types. Please consult the API documentation for a complete list.
- `event_type`: References a specific set of event types predefined in the system. You can find the complete list in the API documentation.
- `object_name`: This allows you to search by the particular name of the astronomical object.
- `coordinates`: You can supply the coordinates of an area of interest to find messages about events that occurred in that region. The coordinates should be provided in the ICRS frame. If you provide coordinates, you must also specify a radius (`radius`) to search for messages that mention coordinates within that radius.
- `messenger_type`: This pertains to the type of messenger associated with the event. Here, you can filter by four types: `electromagnetic radiation`, `gravitational waves`, `neutrinos`, and `cosmic rays`.
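
For illustration, here is a minimal sketch of a combined-filter search call, assuming the third-party `requests` package; the endpoint and parameter names are taken from the curl examples below.

```python
# A minimal sketch of a combined-filter search call, assuming the `requests`
# package. Endpoint and parameter names come from the curl examples below.
import requests

response = requests.get(
    'https://lm-astronomy.labs.jb.gg/api/search/',
    params={
        'object_type': 'Supernova',
        'messenger_type': 'electromagnetic radiation',
        'coordinates': '266.76 -28.89',
        'radius': 5,  # required whenever coordinates are given
    },
    headers={'accept': 'application/json'},
)
response.raise_for_status()
matches = response.json()
```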

## Local Setup

To run this API locally, use:

```
docker compose build && docker compose up
```

## API Endpoints Examples

The following are examples of how to make requests to the API endpoints:

- To filter by both `event_type` and `object_type`:

```
curl -X 'GET' \
'https://lm-astronomy.labs.jb.gg/api/search/?radius=3&event_type=High%20Energy%20Event&object_type=Supernova' \
-H 'accept: application/json'
```

- To search within a given radius of specific coordinates:

```
curl -X 'GET' \
'https://lm-astronomy.labs.jb.gg/api/search/?coordinates=266.76%20-28.89&radius=5' \
-H 'accept: application/json'
```

You can provide coordinates in the equatorial coordinate system, expressed either in decimal degrees or in sexagesimal format, without commas or explicit units; see the first three examples at [NASA's HEASARC](https://heasarc.gsfc.nasa.gov/Tools/name_or_coordinates_help.html).
**Please note**: To make a request using object coordinates, you must also provide a radius parameter.
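
As an illustration of the two accepted formats, the snippet below uses astropy (already a dependency of the API) to show that a decimal-degree string and its sexagesimal counterpart denote the same ICRS position; the values are the ones from the example above.

```python
# Illustration only: the two strings below denote the same ICRS position,
# once in decimal degrees and once in sexagesimal format.
from astropy import units as u
from astropy.coordinates import SkyCoord

decimal = SkyCoord('266.76 -28.89', unit=u.deg, frame='icrs')
sexagesimal = SkyCoord('17 47 02.4 -28 53 24', unit=(u.hourangle, u.deg), frame='icrs')
print(decimal.separation(sexagesimal).arcsec)  # ~0: both parse to the same point
```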

- To get metadata for a specific ATel message:

```
curl -X 'GET' \
'https://lm-astronomy.labs.jb.gg/api/atel/14778' \
-H 'accept: application/json'
```

# Entity Extraction Pipeline

This pipeline extracts entities from astronomical messages published in GCN Circulars and the Astronomer's Telegram (ATel).

Source data:

- [GCN Circulars](https://gcn.gsfc.nasa.gov)
- [ATel](https://www.astronomerstelegram.org)

## Setup

Follow these steps to set up your environment.

### Step 1: Add OpenAI API Key

First, add the OpenAI API key to your environment variables, replacing `{key}` with your OpenAI key.

```sh
export openai_api_key={key}
```

### Step 2: Install Requirements

Make sure all necessary Python packages are installed.

```sh
pip install -r scripts/requirements.txt
pip install -r api/requirements.txt
```

## Pipeline Usage

### 1. Crawlers (`atel_crawler.py` and `gcn_crawler.py`)

These scripts update the ATel and GCN datasets stored in `data/{dataset_name}/dataset.json`. Run using the following
commands:

```
python3 atel_crawler.py
python3 gcn_crawler.py
```

### 2. Generate Embeddings using OpenAI API (`gen_embeddings.py`)

Use this script to generate embeddings for given messages.

1. For generating embeddings for the dataset messages, run:

```
python3 scripts/gen_embeddings.py -d {dataset_name} -e text-similarity-davinci-001 -i {indices_path}
```

Results will be written in the `data/{dataset_name}/` folder.

2. For generating embeddings for entities, run:

```
python3 scripts/gen_embeddings.py -d {dataset_name} -e text-embedding-ada-002 -en {entity_name} -i {indices_path}
```

Results will be written in the `data/{dataset_name}/{entity_name}/` folder.
The `data/weights/` folder contains the weights of a fine-tuned [pearsonkyle/gpt2-exomachina](https://huggingface.co/pearsonkyle/gpt2-exomachina) model, which can also be used for generating embeddings.
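
For reference, a single embedding call (here with `text-embedding-ada-002`) looks roughly like the sketch below, which assumes the pre-1.0 `openai` Python package; the actual script may batch inputs and handle retries differently.

```python
# Hedged sketch of a single embedding call with the pre-1.0 `openai` package;
# the message text is a placeholder.
import os

import openai

openai.api_key = os.environ['openai_api_key']
resp = openai.Embedding.create(
    model='text-embedding-ada-002',
    input='ATel #14778: ...',  # placeholder message text
)
vector = resp['data'][0]['embedding']  # list of floats
```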

### 3. Generate Completions using OpenAI API (`gen_completions.py`)

This script generates completions for the given messages, extracting entities such as `messenger_type`, `coordinates`, `object_name_or_event_ID`, `object_type`, `event_type`, and `coordinate_system`. Run using:

```
python3 scripts/gen_completions.py --dataset {dataset_name} -e gpt-4-0613 -i {indices_path}
```

Results will be written in the `data/{dataset_name}/function_completions.json` file.
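
The `function_completions.json` name suggests OpenAI function calling; the sketch below shows one way such a completion could be requested with the pre-1.0 `openai` package. The function schema and message text are illustrative placeholders, not the project's actual ones.

```python
# Hedged sketch of an entity-extraction completion via function calling;
# the schema and message text are illustrative placeholders.
import json

import openai

message_text = 'ATel #14778: ...'  # placeholder
functions = [{
    'name': 'extract_entities',
    'description': 'Extract astronomical entities from a message.',
    'parameters': {
        'type': 'object',
        'properties': {
            'messenger_type': {'type': 'string'},
            'coordinates': {'type': 'string'},
            'object_name_or_event_ID': {'type': 'string'},
        },
    },
}]

resp = openai.ChatCompletion.create(
    model='gpt-4-0613',
    messages=[{'role': 'user', 'content': message_text}],
    functions=functions,
    function_call={'name': 'extract_entities'},
)
entities = json.loads(resp['choices'][0]['message']['function_call']['arguments'])
```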

### 4. Entity Extractor (`extract_data.py`)

This script extracts entities for the given dataset messages. The entities are divided into two types: `event_type`, `object_type`, and `object_name_or_event_ID` are ranked using a feed-forward network (`ffn_inference.py`), while `messenger_type`, `coordinates`, `coordinate_system`, and `date` are extracted using the OpenAI API or directly from the text. Run using:

```
python3 scripts/extract_data.py -d {dataset_name} -i {indices_path}
```

Results will be added to the `data/{dataset_name}/entities.json` file.
Note that, for the data to be extracted, embeddings and completions for the given dataset messages must already have been computed with `gen_embeddings.py` and `gen_completions.py`.

#### 4.1 Grouped Entity Extraction

To simplify searching through messages, we categorized the most significant types of events and objects and mapped the `event_type` and `object_type` entities obtained via the FFN onto these categories. This reduces the variety of potential event and object types and is accomplished by the `get_grouped_entities` function.

Note, however, that the categorization process can occasionally produce groups outside the predetermined ones; the sketch below illustrates the idea.
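
The following is a purely illustrative sketch: fine-grained labels are mapped onto a smaller set of categories, and labels without a mapping pass through unchanged, which is how out-of-vocabulary groups can appear. The mapping table is hypothetical, not the project's real one.

```python
# Hypothetical mapping table; the project's real categories differ.
EVENT_GROUPS = {
    'Type Ia Supernova': 'Supernova',
    'Type II Supernova': 'Supernova',
    'GRB afterglow': 'Gamma-Ray Burst',
}

def get_grouped_entities(labels):
    # Unmapped labels pass through, so groups outside the predefined
    # set can occasionally appear in the output.
    return [EVENT_GROUPS.get(label, label) for label in labels]

print(get_grouped_entities(['Type Ia Supernova', 'Kilonova']))
# -> ['Supernova', 'Kilonova']
```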

### 5. FFN Inference (`ffn_inference.py`)

This script ranks embeddings for `object_name_or_event_ID`, `event_type`, and `object_type` using the trained feed-forward network [[1]](#1). Directories in `data/ffn/` contain the feed-forward network weights and are named according to their output dimensions. Run using:

```
python3 scripts/ffn_inference.py -d {dataset_name} -en {entity_name} -i {indices_path}
```
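
The network's exact architecture and weight format are not documented here; the sketch below is a hedged guess at what inference could look like, assuming PyTorch, a single hidden layer, and placeholder dimensions and paths.

```python
# Hedged sketch of FFN ranking; the architecture, dimensions, and weight
# path are placeholders, not the project's actual ones.
import torch
from torch import nn

class RankingFFN(nn.Module):
    def __init__(self, emb_dim: int, n_classes: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(emb_dim, 512),
            nn.ReLU(),
            nn.Linear(512, n_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

model = RankingFFN(emb_dim=1536, n_classes=64)               # placeholder dims
model.load_state_dict(torch.load('data/ffn/64/weights.pt'))  # hypothetical path
model.eval()
scores = model(torch.randn(1, 1536)).softmax(dim=-1)         # class scores for ranking
```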

**Note:** In each script, `-i` specifies the file with indices. If it is not provided, the indices are calculated as the difference between the indices in the `data/{dataset_name}/dataset.json` and `data/{dataset_name}/entities.json` files, as in the sketch below.
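
A sketch of that default, assuming both files are JSON objects keyed by message index (file names illustrative):

```python
# Sketch of the default index selection described in the note above.
import json

with open('data/atel/dataset.json') as f:
    dataset = json.load(f)
with open('data/atel/entities.json') as f:
    entities = json.load(f)

# Messages present in the dataset but not yet in entities.json.
indices = sorted(set(dataset) - set(entities))
```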

## References

1. Sotnikov, V.; Chaikova, A. Language Models for Multimessenger Astronomy. Galaxies 2023, 11,
63. [https://doi.org/10.3390/galaxies11030063](https://doi.org/10.3390/galaxies11030063).
4 changes: 3 additions & 1 deletion docker-compose.yml
@@ -1,8 +1,10 @@
version: '3'

services:
  api:
    image: ${IMAGE_PATH:-api}:${IMAGE_VERSION:-latest}
    build:
      context: .
      dockerfile: docker/api/Dockerfile
    ports:
      - "8000:8000"
15 changes: 13 additions & 2 deletions docker/api/Dockerfile
@@ -1,9 +1,20 @@
FROM node:14 AS FRONT

WORKDIR /app

RUN npm install -g serve
COPY src/front .
RUN npm install
RUN npm run build


FROM python:3.11

# Install dependencies
COPY docker/api/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt
COPY src/api /app/api
COPY --from=FRONT /app/build /app/static
WORKDIR /app

CMD ["python", "-m", "gunicorn", "-w", "6", "-t", "90", "-k", "uvicorn.workers.UvicornWorker", "-b", "0.0.0.0:8000", "api.main:app"]
CMD ["python", "-m", "gunicorn", "-w", "2", "-t", "90", "-k", "uvicorn.workers.UvicornWorker", "-b", "0.0.0.0:8000", "api.main:app"]
6 changes: 5 additions & 1 deletion docker/api/requirements.txt
@@ -1,3 +1,7 @@
fastapi==0.85.2
uvicorn==0.18.3
gunicorn==20.1.0
astropy==5.3.1
rdflib==7.0.0
aiohttp==3.8.6
scipy==1.11.3
50 changes: 50 additions & 0 deletions src/api/atel.py
@@ -0,0 +1,50 @@
import json
from datetime import datetime
from typing import Optional

import aiohttp
from pydantic import BaseModel
from rdflib import Graph


class ATelRecord(BaseModel):
    author: str
    creator: str
    creatoremail: str
    description: str
    identifier: str
    link: str
    provenance: str
    publisher: str
    title: str
    date: datetime


async def load_record(record_id: str) -> Optional[ATelRecord]:
    """Load the RDF record from the ATel website and convert it to an ATelRecord."""
    url = f'https://www.astronomerstelegram.org/?rss+{record_id}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # If the record does not exist, ATel responds with a non-200 status.
            if response.status != 200:
                return None
            response_rss_xml = await response.text()
    g = Graph()
    g.parse(data=response_rss_xml, format='xml')
    record_raw = json.loads(g.serialize(format='json-ld'))
    # The JSON-LD serialization yields several nodes; the record itself is the longest one.
    record_raw = max(record_raw, key=lambda x: len(x))
    record = {}
    for key, value in record_raw.items():
        # RSS 1.0 fields (author, title, description, ...) share this URI prefix.
        if key.startswith('http://purl.org/rss/1.0/'):
            record[key[len('http://purl.org/rss/1.0/'):]] = value[0]['@value'].strip()
    record['date'] = datetime.fromisoformat(record_raw['http://purl.org/dc/elements/1.1/date'][0]['@value'].strip())
    atel_record = ATelRecord(**record)
    return atel_record


if __name__ == '__main__':
    import asyncio

    loop = asyncio.get_event_loop()
    print(loop.run_until_complete(load_record('15')))