Skip to content

Commit

Permalink
Merge pull request #276 from nexB/224-matching-on-purldb-traefik
Browse files Browse the repository at this point in the history
Create server-side matching pipeline #224
  • Loading branch information
JonoYang authored Jan 31, 2024
2 parents baf41d2 + f2f4b81 commit b5b6ba9
Show file tree
Hide file tree
Showing 46 changed files with 2,224 additions and 173 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docs-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.9]
python-version: ["3.10"]

steps:
- name: Checkout code
Expand Down
23 changes: 20 additions & 3 deletions .github/workflows/purldb-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-20.04

services:
postgres:
postgres1:
image: postgres:13
env:
POSTGRES_DB: ${{ env.POSTGRES_DB }}
Expand All @@ -28,10 +28,25 @@ jobs:
ports:
- 5432:5432

postgres2:
  # Second PostgreSQL service container, holding the "matchcodeio" database.
  # The database name, user and password are all the literal "matchcodeio".
  image: "postgres:13"
  env:
    POSTGRES_DB: "matchcodeio"
    POSTGRES_USER: "matchcodeio"
    POSTGRES_PASSWORD: "matchcodeio"
    POSTGRES_INITDB_ARGS: ${{ env.POSTGRES_INITDB_ARGS }}
  # Container health check: poll pg_isready every 10s, 5s timeout, 5 retries.
  options: "--health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5"
  ports:
    # Published on host port 5433; the container itself listens on 5432.
    - "5433:5432"

strategy:
max-parallel: 4
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.10", "3.11"]

steps:
- name: Checkout code
Expand All @@ -50,5 +65,7 @@ jobs:
- name: Run tests
working-directory: .
run: |
make envfile
make envfile_testing
sudo mkdir /etc/scancodeio
sudo cp .env /etc/scancodeio
make test
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,5 @@ tcl

# Env Files
.env

var/
9 changes: 4 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,9 @@ RUN apt-get update \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY setup.cfg setup.py /app/
RUN mkdir -p /app/matchcode-toolkit/src/
COPY matchcode-toolkit/setup.cfg matchcode-toolkit/setup.py /app/matchcode-toolkit/
RUN pip install -e matchcode-toolkit
RUN pip install -e .
# Install the dependencies before the codebase COPY for proper Docker layer caching
COPY setup.cfg setup.py requirements.txt /app/
RUN pip install --no-cache-dir -c requirements.txt .

# Copy the codebase
COPY . /app
7 changes: 6 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
graft src
graft clearcode
graft clearindex
graft matchcode
graft minecode
graft packagedb
graft purldb

include *.LICENSE
include NOTICE
Expand Down
28 changes: 26 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
# Python version can be specified with `$ PYTHON_EXE=python3.x make conf`
PYTHON_EXE?=python3
VENV=venv
MANAGE=${VENV}/bin/python manage.py
MANAGE=${VENV}/bin/python manage_purldb.py
MATCHCODE_MANAGE=${VENV}/bin/python manage_matchcode.py
ACTIVATE?=. ${VENV}/bin/activate;
VIRTUALENV_PYZ=../etc/thirdparty/virtualenv.pyz
# Do not depend on Python to generate the SECRET_KEY
Expand All @@ -19,6 +20,7 @@ GET_SECRET_KEY=`base64 /dev/urandom | head -c50`
ENV_FILE=.env
# Customize with `$ make postgres PACKAGEDB_DB_PASSWORD=YOUR_PASSWORD`
PACKAGEDB_DB_PASSWORD=packagedb
MATCHCODEIO_DB_PASSWORD=matchcodeio

# Use sudo for postgres, but only on Linux
UNAME := $(shell uname)
Expand Down Expand Up @@ -46,6 +48,13 @@ envfile:
@mkdir -p $(shell dirname ${ENV_FILE}) && touch ${ENV_FILE}
@echo SECRET_KEY=\"${GET_SECRET_KEY}\" > ${ENV_FILE}

# Write a fresh env file for test runs. Aborts with exit status 1 when the env
# file already exists; otherwise appends a generated SECRET_KEY and
# SCANCODEIO_DB_PORT="5433" to it.
# NOTE(review): 5433 presumably targets the second PostgreSQL service used in
# CI -- confirm against the workflow's port mapping.
envfile_testing:
	@echo "-> Create the .env file and generate a secret key"
	@if [ -f $(ENV_FILE) ]; then echo ".env file exists already"; exit 1; fi
	@mkdir -p $(shell dirname $(ENV_FILE)) && touch $(ENV_FILE)
	@echo SECRET_KEY=\"$(GET_SECRET_KEY)\" >> $(ENV_FILE)
	@echo SCANCODEIO_DB_PORT=\"5433\" >> $(ENV_FILE)

isort:
@echo "-> Apply isort changes to ensure proper imports ordering"
${VENV}/bin/isort .
Expand Down Expand Up @@ -87,9 +96,23 @@ postgres:
${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=packagedb packagedb
@$(MAKE) migrate

# (Re)create the 'matchcodeio' PostgreSQL role and database, then run the
# MatchCode migrations. The createuser, password and dropdb steps end in
# `|| true`, so a failure there does not stop the target. The existing
# 'matchcodeio' database is dropped before being created again.
postgres_matchcodeio:
	@echo "-> Configure PostgreSQL database"
	@echo "-> Create database user 'matchcodeio'"
	$(SUDO_POSTGRES) createuser --no-createrole --no-superuser --login --inherit --createdb matchcodeio || true
	$(SUDO_POSTGRES) psql -c "alter user matchcodeio with encrypted password '$(MATCHCODEIO_DB_PASSWORD)';" || true
	@echo "-> Drop 'matchcodeio' database"
	$(SUDO_POSTGRES) dropdb matchcodeio || true
	@echo "-> Create 'matchcodeio' database"
	$(SUDO_POSTGRES) createdb --encoding=utf-8 --owner=matchcodeio matchcodeio
	$(MATCHCODE_MANAGE) migrate

run:
${MANAGE} runserver 8001 --insecure

# Start the MatchCode.io server through manage_matchcode.py on port 8002
# (the PurlDB `run` target uses port 8001).
run_matchcodeio:
	$(MATCHCODE_MANAGE) runserver 8002 --insecure

seed:
${MANAGE} seed

Expand All @@ -107,8 +130,9 @@ process_scans:

test:
@echo "-> Run the test suite"
${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore purldb-toolkit --ignore packagedb/tests/test_throttling.py
${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore matchcode_pipeline --ignore matchcode_project --ignore purldb-toolkit --ignore packagedb/tests/test_throttling.py
${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs packagedb/tests/test_throttling.py
${ACTIVATE} DJANGO_SETTINGS_MODULE=matchcode_project.settings ${PYTHON_EXE} -m pytest -vvs matchcode_pipeline
${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs matchcode-toolkit --ignore matchcode-toolkit/src/matchcode_toolkit/pipelines
${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs purldb-toolkit

Expand Down
80 changes: 56 additions & 24 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ This repo consists of four main tools:
- MineCode that contains utilities to mine package repositories
- MatchCode that contains utilities to index package metadata and resources for
matching
- MatchCode.io that provides package matching functionalities for codebases
- ClearCode that contains utilities to mine Clearlydefined for package data

These are designed to be used first for reference such that one can query for
Expand Down Expand Up @@ -39,6 +40,7 @@ Once the prerequisites have been installed, set up PurlDB with the following com
make dev
make envfile
make postgres
make postgres_matchcodeio

Once PurlDB and the database have been set up, run tests to ensure functionality:
::
Expand All @@ -53,6 +55,11 @@ Start the PurlDB server by running:

make run

Start the MatchCode.io server by running:
::

make run_matchcodeio

To start visiting upstream package repositories for package metadata:
::

Expand All @@ -69,33 +76,13 @@ Populating Package Resource Data
The Resources of Packages can be collected using the scan queue. By default, a
scan request will be created for each mapped Package.

The following environment variables will have to be set for the scan queue
commands to work:
Given that you have access to a ScanCode.io instance, the following environment
variables will have to be set for the scan queue commands to work:
::

SCANCODEIO_URL=<ScanCode.io API URL>
SCANCODEIO_API_KEY=<ScanCode.io API Key>

``matchcode-toolkit`` will also have to be installed in the same environment as
ScanCode.io. If running ScanCode.io in a virtual environment from a git
checkout, you can install ``matchcode-toolkit`` in editable mode:
::

pip install -e <Path to purldb/matchcode-toolkit>

Otherwise, you can create a wheel from ``matchcode-toolkit`` and install it in
the ScanCode.io virtual environment or modify the ScanCode.io Dockerfile to
install the ``matchcode-toolkit`` wheel.

To build the ``matchcode-toolkit`` wheel:
::

# From the matchcode-toolkit directory
python setup.py bdist_wheel

The wheel ``matchcode_toolkit-0.0.1-py3-none-any.whl`` will be created in the
``matchcode-toolkit/dist/`` directory.

The scan queue is run using two commands:
::

Expand Down Expand Up @@ -136,8 +123,8 @@ matching indices from the collected Package data:
make index_packages


API Endpoints
-------------
PurlDB API Endpoints
--------------------

* ``api/packages``

Expand Down Expand Up @@ -172,6 +159,51 @@ API Endpoints
* Used to check the SHA1 values of archives from a scan to determine if they are known Packages


MatchCode.io
------------

MatchCode.io is a Django app, based on ScanCode.io, that exposes one API
endpoint, ``api/matching``, which takes a ScanCode.io codebase scan and
performs Package matching on it.

Currently, it performs three matching steps:

* Match codebase resources against the Packages in the PackageDB
* Match codebase resources against the Resources in the PackageDB
* Match codebase directories against the directory matching indices of
MatchCode


MatchCode.io API Endpoints
--------------------------

* ``api/matching``

* Performs Package matching on an uploaded ScanCode.io scan
* Intended to be used with the ``match_to_purldb`` pipeline in ScanCode.io


Docker Setup for Local Development and Testing
----------------------------------------------

PurlDB and MatchCode.io are two separate Django apps. In order to run both of
these Django apps on the same host, we need to use Traefik.

Traefik is an edge router that receives requests and finds out which services
are responsible for handling them. In the docker-compose.yml files for PurlDB
and MatchCode.io, we have made these two services part of the same Docker
network and set up the routes for each service.

All requests to the host go to the PurlDB service, but requests that go to the
``api/matching`` endpoint are routed to the MatchCode.io service.

To run PurlDB and MatchCode.io with Docker:
::

docker compose -f docker-compose_traefik.yml up -d
docker compose -f docker-compose_purldb.yml up -d
docker compose -f docker-compose_matchcodeio.yml up -d

Funding
-------

Expand Down
88 changes: 88 additions & 0 deletions docker-compose_matchcodeio.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Docker Compose stack for MatchCode.io: database, redis, web app, worker and
# an nginx front end that is exposed through Traefik labels.
version: "3"

services:
  # PostgreSQL database for MatchCode.io; settings come from the env file.
  matchcodeio_db:
    image: postgres:13
    env_file:
      - docker_matchcodeio.env
    volumes:
      - matchcodeio_db_data:/var/lib/postgresql/data/

  matchcodeio_redis:
    image: redis
    # Enable redis data persistence using the "Append Only File" with the
    # default policy of fsync every second. See https://redis.io/topics/persistence
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    restart: always

  matchcodeio_web:
    build: .
    # Wait for this stack's own database service before migrating. The service
    # is named "matchcodeio_db"; no service called "db" is defined in this
    # file, so a bare "db" could only resolve to a container from another
    # stack on the shared external network.
    command: wait-for-it --strict --timeout=60 matchcodeio_db:5432 -- sh -c "
      python manage_matchcode.py migrate --database default &&
      python manage_matchcode.py collectstatic --no-input --verbosity 0 --clear &&
      gunicorn matchcode_project.wsgi:application --bind :8001 --timeout 600 --workers 8"
    environment:
      - DJANGO_SETTINGS_MODULE=matchcode_project.settings
    env_file:
      - docker_matchcodeio.env
    expose:
      - "8001"
    volumes:
      - .env:/opt/scancodeio/.env
      - /etc/scancodeio/:/etc/scancodeio/
      - workspace:/var/scancodeio/workspace/
      - static:/var/scancodeio/static/
    # Start the database container before the web container.
    depends_on:
      - matchcodeio_db

  matchcodeio_worker:
    build: .
    # Ensure that potential db migrations run first by waiting until "web" is up
    command: wait-for-it --strict --timeout=120 matchcodeio_web:8001 -- sh -c "
      python manage_matchcode.py rqworker --worker-class scancodeio.worker.ScanCodeIOWorker
      --queue-class scancodeio.worker.ScanCodeIOQueue
      --verbosity 1"
    environment:
      - DJANGO_SETTINGS_MODULE=matchcode_project.settings
    env_file:
      - docker_matchcodeio.env
    volumes:
      - .env:/opt/scancodeio/.env
      - /etc/scancodeio/:/etc/scancodeio/
      - workspace:/var/scancodeio/workspace/
    depends_on:
      - matchcodeio_redis
      - matchcodeio_web

  matchcodeio_nginx:
    image: nginx:alpine
    volumes:
      - ./etc/nginx/matchcodeio-conf.d/:/etc/nginx/conf.d/
      - /var/www/html:/var/www/html
      - static:/var/scancodeio/static/
    depends_on:
      - matchcodeio_web
    restart: always
    labels:
      - "traefik.enable=true"
      # Route only the MatchCode.io paths (/api/matching, /api/runs, /project,
      # /runs) on 127.0.0.1 and localhost to this service.
      - "traefik.http.routers.matchcodeio.rule=
        Host(`127.0.0.1`) && PathPrefix(`/api/matching`)
        || Host(`127.0.0.1`) && PathPrefix(`/api/runs`)
        || Host(`127.0.0.1`) && PathPrefix(`/project`)
        || Host(`127.0.0.1`) && PathPrefix(`/runs`)
        || Host(`localhost`) && PathPrefix(`/api/matching`)
        || Host(`localhost`) && PathPrefix(`/api/runs`)
        || Host(`localhost`) && PathPrefix(`/project`)
        || Host(`localhost`) && PathPrefix(`/runs`)"
      - "traefik.http.routers.matchcodeio.entrypoints=web"

# Join the pre-existing external network named "purldb" instead of creating a
# stack-private default network.
networks:
  default:
    name: purldb
    external: true

# NOTE(review): volume names are scoped by Compose project name; if another
# stack runs under the same project name and also declares "redis_data",
# "static" or "workspace", the volumes would be shared -- confirm intended.
volumes:
  redis_data:
  static:
  workspace:
  matchcodeio_db_data:
Loading

0 comments on commit b5b6ba9

Please sign in to comment.