From 9375be3f117facd96d24904e91e12e95eaff43b4 Mon Sep 17 00:00:00 2001 From: "Rob Knop (Nersc)" Date: Wed, 30 Oct 2024 10:37:35 -0700 Subject: [PATCH 1/5] Update spin files for rknop_dev environment to run on production spin cluster --- .../{tom-rknop-dev-app.yaml => tom-app.yaml} | 130 +++++++++++----- spin_admin/rknop-dev/tom-brokerpoll.yaml | 125 +++++++++++++++ ...{tom-rknop-dev-cert.yaml => tom-cert.yaml} | 4 +- ...postgres-pvc.yaml => tom-mongodb-pvc.yaml} | 6 +- spin_admin/rknop-dev/tom-mongodb.yaml | 144 ++++++++++++++++++ spin_admin/rknop-dev/tom-pgdump.yaml | 62 ++++++++ spin_admin/rknop-dev/tom-postgres-pvc.yaml | 13 ++ ...op-dev-postgres.yaml => tom-postgres.yaml} | 28 ++-- ...pruner-cron.yaml => tom-query-pruner.yaml} | 34 ++--- ...uery-runner.yaml => tom-query-runner.yaml} | 40 ++--- spin_admin/rknop-dev/tom-secrets.yaml | 27 ++++ .../rknop-dev/tom-send-alerts-cron.yaml | 101 ++++++++++++ ...knop-dev-app-shell.yaml => tom-shell.yaml} | 92 +++++++---- .../rknop-dev/tom-update-metrics-cron.yaml | 93 +++++++++++ 14 files changed, 776 insertions(+), 123 deletions(-) rename spin_admin/rknop-dev/{tom-rknop-dev-app.yaml => tom-app.yaml} (50%) create mode 100644 spin_admin/rknop-dev/tom-brokerpoll.yaml rename spin_admin/rknop-dev/{tom-rknop-dev-cert.yaml => tom-cert.yaml} (72%) rename spin_admin/rknop-dev/{tom-rknop-dev-postgres-pvc.yaml => tom-mongodb-pvc.yaml} (65%) create mode 100644 spin_admin/rknop-dev/tom-mongodb.yaml create mode 100644 spin_admin/rknop-dev/tom-pgdump.yaml create mode 100644 spin_admin/rknop-dev/tom-postgres-pvc.yaml rename spin_admin/rknop-dev/{tom-rknop-dev-postgres.yaml => tom-postgres.yaml} (83%) rename spin_admin/rknop-dev/{tom-rknop-dev-query-pruner-cron.yaml => tom-query-pruner.yaml} (74%) rename spin_admin/rknop-dev/{tom-rknop-dev-query-runner.yaml => tom-query-runner.yaml} (74%) create mode 100644 spin_admin/rknop-dev/tom-secrets.yaml create mode 100644 spin_admin/rknop-dev/tom-send-alerts-cron.yaml rename spin_admin/rknop-dev/{tom-rknop-dev-app-shell.yaml => tom-shell.yaml} (51%) create mode 100644 spin_admin/rknop-dev/tom-update-metrics-cron.yaml diff --git a/spin_admin/rknop-dev/tom-rknop-dev-app.yaml b/spin_admin/rknop-dev/tom-app.yaml similarity index 50% rename from spin_admin/rknop-dev/tom-rknop-dev-app.yaml rename to spin_admin/rknop-dev/tom-app.yaml index 6ef07373..b8fa18d2 100644 --- a/spin_admin/rknop-dev/tom-rknop-dev-app.yaml +++ b/spin_admin/rknop-dev/tom-app.yaml @@ -2,15 +2,15 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: tom-rknop-dev-app - namespace: desc-tom + name: tom-app + namespace: desc-tom-rknop-dev spec: progressDeadlineSeconds: 600 replicas: 1 revisionHistoryLimit: 10 selector: matchLabels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-app strategy: rollingUpdate: maxSurge: 1 @@ -19,31 +19,46 @@ spec: template: metadata: annotations: - cattle.io/timestamp: "2023-02-06T16:44:40Z" + cattle.io/timestamp: "2023-02-27T21:51:32Z" field.cattle.io/ports: '[[]]' nersc.gov/collab_uids: 70268,99896 nersc.gov/gid: "95089" - nersc.gov/gids: 45703,60152,57177,58102,59318,60070,63477,64483,79186,70268,92576,94721,95089,96414,99650 + nersc.gov/gids: 95089,57177,60152,96414 nersc.gov/roles: user nersc.gov/uid: "95089" nersc.gov/username: raknop creationTimestamp: null labels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app + workload.user.cattle.io/workloadselector: 
deployment-desc-tom-rknop-dev-tom-app spec: + affinity: {} containers: - env: - name: DB_HOST - value: tom-rknop-dev-postgres + value: tom-postgres - name: DB_NAME value: tom_desc - name: DB_PASS value: fragile - name: DB_USER value: postgres - image: registry.nersc.gov/m1727/raknop/tom_desc_bindmount:chimaera + - name: ALERCE_KAFKA_SERVER + value: b-3-public.publicproduction.o8ncxm.c18.kafka.us-east-1.amazonaws.com:9196,b-2-public.publicproduction.o8ncxm.c18.kafka.us-east-1.amazonaws.com:9196,b-1-public.publicproduction.o8ncxm.c18.kafka.us-east-1.amazonaws.com:9196 + - name: FINK_GROUP_ID + value: lsstfr-johann + - name: FINK_SERVER + value: 134.158.74.95:24499, + - name: FINK_TOPIC + value: fink_early_sn_candidates_ztf + - name: FINK_USERNAME + value: johann + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secrets/pitt_google_auth_key.json + - name: GOOGLE_CLOUD_PROJECT + value: elasticc-challenge + image: registry.nersc.gov/m1727/raknop/tom_server_bindmount imagePullPolicy: Always - name: tom-rknop-dev-app + name: tom-app resources: {} securityContext: allowPrivilegeEscalation: false @@ -62,11 +77,15 @@ spec: tty: true volumeMounts: - mountPath: /secrets - name: tom-rknop-dev-secrets + name: tom-secrets - mountPath: /tom_desc - name: tom-rknop-dev-deployment + name: tom-deployment + - mountPath: /code + name: tom-checkout - mountPath: /query_results - name: tom-rknop-dev-query-results + name: tom-query-results + - mountPath: /sample + name: tom-sample dnsConfig: {} dnsPolicy: ClusterFirst restartPolicy: Always @@ -77,33 +96,52 @@ spec: imagePullSecrets: - name: registry-nersc volumes: - - name: tom-rknop-dev-secrets + - name: tom-secrets secret: defaultMode: 256 optional: false - secretName: tom-rknop-dev-secrets + secretName: tom-secrets - hostPath: - path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/dev/tom_desc/tom_desc + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc type: Directory - name: tom-rknop-dev-deployment + name: tom-checkout - hostPath: - path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/dev/query_results + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc/tom_desc type: Directory - name: tom-rknop-dev-query-results + name: tom-deployment + - hostPath: + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/query_results + type: Directory + name: tom-query-results + - hostPath: + path: /global/cfs/cdirs/desc-td/ELASTICC2 + type: Directory + name: tom-sample --- apiVersion: v1 kind: Service metadata: - name: tom-rknop-dev-app - namespace: desc-tom + annotations: + field.cattle.io/targetWorkloadIds: '["deployment:desc-tom-rknop-dev:tom-app"]' + name: tom-app + namespace: desc-tom-rknop-dev + ownerReferences: + - apiVersion: apps/v1beta2 + controller: true + kind: deployment + name: tom-app + uid: e8f8e9fa-9bb1-475a-810d-483dbc7f6bc8 spec: + clusterIP: None + clusterIPs: + - None ports: - name: default port: 42 protocol: TCP targetPort: 42 selector: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-app sessionAffinity: None type: ClusterIP status: @@ -112,15 +150,23 @@ status: apiVersion: v1 kind: Service metadata: + annotations: + field.cattle.io/targetWorkloadIds: '["deployment:desc-tom-rknop-dev:tom-app"]' name: tom-ingress-service - namespace: desc-tom + namespace: desc-tom-rknop-dev + ownerReferences: + - apiVersion: v1beta1/extensions + controller: 
true + kind: Ingress + name: tom-app + uid: e8f8e9fa-9bb1-475a-810d-483dbc7f6bc8 spec: ports: - port: 8080 protocol: TCP targetPort: 8080 selector: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-app sessionAffinity: None type: ClusterIP status: @@ -130,27 +176,20 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: annotations: - nersc.gov/clustername: development + nersc.gov/clustername: production nginx.ingress.kubernetes.io/preserve-trailing-slash: "true" nginx.ingress.kubernetes.io/proxy-body-size: 500m nginx.ingress.kubernetes.io/proxy-read-timeout: "600" - generation: 3 + generation: 1 labels: cattle.io/creator: norman - name: tom-rknop-dev-app-ingress - namespace: desc-tom + name: tom-desc-app + namespace: desc-tom-rknop-dev + selfLink: /apis/networking.k8s.io/v1/namespaces/desc-tom-rknop-dev/ingresses/tom-app spec: + ingressClassName: nginx rules: - - host: tom-rknop-dev-app-ingress.desc-tom.development.svc.spin.nersc.org - http: - paths: - - backend: - service: - name: tom-ingress-service - port: - number: 8080 - pathType: ImplementationSpecific - - host: desc-tom-rknop-dev.lbl.gov + - host: tom-desc-app.desc-tom-rknop-dev.production.svc.spin.nersc.org http: paths: - backend: @@ -159,8 +198,17 @@ spec: port: number: 8080 pathType: ImplementationSpecific - tls: - - hosts: - - desc-tom-rknop-dev.lbl.gov - secretName: desc-tom-rknop-dev-cert +# - host: desc-tom-rknop-dev.lbl.gov +# http: +# paths: +# - backend: +# service: +# name: tom-ingress-service +# port: +# number: 8080 +# pathType: ImplementationSpecific +# tls: +# - hosts: +# - desc-tom-rknop-dev.lbl.gov +# secretName: tom-cert --- diff --git a/spin_admin/rknop-dev/tom-brokerpoll.yaml b/spin_admin/rknop-dev/tom-brokerpoll.yaml new file mode 100644 index 00000000..3c9aadff --- /dev/null +++ b/spin_admin/rknop-dev/tom-brokerpoll.yaml @@ -0,0 +1,125 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tom-brokerpoll + namespace: desc-tom-rknop-dev +spec: + progressDeadlineSeconds: 600 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-brokerpoll + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate + template: + metadata: + annotations: + cattle.io/timestamp: "2023-02-06T16:44:40Z" + field.cattle.io/ports: '[[]]' + nersc.gov/collab_uids: 70268,99896 + nersc.gov/gid: "95089" + nersc.gov/gids: 45703,60152,57177,58102,59318,60070,63477,64483,79186,70268,92576,94721,95089,96414,99650 + nersc.gov/roles: user + nersc.gov/uid: "95089" + nersc.gov/username: raknop + creationTimestamp: null + labels: + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-brokerpoll + spec: + containers: + - env: + - name: DB_HOST + value: tom-postgres + - name: DB_NAME + value: tom_desc + - name: DB_PASS + value: fragile + - name: DB_USER + value: postgres + - name: CASSANDRA_HOST + value: tom-cassandra + - name: CASSANDRA_DB + value: tom_desc + - name: CASSANDRA_USER + value: cassandra + - name: CASSANDRA_PASSWORD + value: cassandra + - name: ALERCE_TOPIC_RELDATEOFFSET + value: '-36' + - name: ALERCE_KAFKA_SERVER + value: b-3-public.publicproduction.o8ncxm.c18.kafka.us-east-1.amazonaws.com:9196,b-2-public.publicproduction.o8ncxm.c18.kafka.us-east-1.amazonaws.com:9196,b-1-public.publicproduction.o8ncxm.c18.kafka.us-east-1.amazonaws.com:9196 + - name: FINK_GROUP_ID + value: 
lsstfr-johann + - name: FINK_SERVER + value: 134.158.74.95:24499, + - name: FINK_TOPIC + value: fink_early_sn_candidates_ztf + - name: FINK_USERNAME + value: johann + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secrets/pitt_google_auth_key.json + - name: GOOGLE_CLOUD_PROJECT + value: elasticc-challenge + image: registry.nersc.gov/m1727/raknop/tom_server_bindmount + imagePullPolicy: Always + name: tom-brokerpoll + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: + - NET_BIND_SERVICE + drop: + - ALL + privileged: false + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 95089 + stdin: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tty: true + volumeMounts: + - mountPath: /secrets + name: tom-secrets + - mountPath: /tom_desc + name: tom-deployment + command: [ "dumb-init" ] + args: + - -r + - 15:10 + - python + - manage.py + - brokerpoll2 + - --do-antares + - --antares-topic + - elasticc2-2 + - --do-pitt + - --pitt-project + - elasticc-challenge + - --pitt-topic + - elasticc-SuperNNova + - --do-alerce + dnsConfig: {} + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: + fsGroup: 57177 + terminationGracePeriodSeconds: 30 + imagePullSecrets: + - name: registry-nersc + volumes: + - name: tom-secrets + secret: + defaultMode: 256 + optional: false + secretName: tom-secrets + - hostPath: + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc/tom_desc + type: Directory + name: tom-deployment diff --git a/spin_admin/rknop-dev/tom-rknop-dev-cert.yaml b/spin_admin/rknop-dev/tom-cert.yaml similarity index 72% rename from spin_admin/rknop-dev/tom-rknop-dev-cert.yaml rename to spin_admin/rknop-dev/tom-cert.yaml index b702d657..3ab7bb27 100644 --- a/spin_admin/rknop-dev/tom-rknop-dev-cert.yaml +++ b/spin_admin/rknop-dev/tom-cert.yaml @@ -4,6 +4,6 @@ data: tls.key: PUT THE RIGHT THING HERE kind: Secret metadata: - name: desc-tom-rknop-dev-cert - namespace: desc-tom + name: tom-cert + namespace: desc-tom-rknop-dev type: kubernetes.io/tls diff --git a/spin_admin/rknop-dev/tom-rknop-dev-postgres-pvc.yaml b/spin_admin/rknop-dev/tom-mongodb-pvc.yaml similarity index 65% rename from spin_admin/rknop-dev/tom-rknop-dev-postgres-pvc.yaml rename to spin_admin/rknop-dev/tom-mongodb-pvc.yaml index e72da341..2efdc5b6 100644 --- a/spin_admin/rknop-dev/tom-rknop-dev-postgres-pvc.yaml +++ b/spin_admin/rknop-dev/tom-mongodb-pvc.yaml @@ -1,13 +1,13 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: tom-rknop-dev-postgres-pvc - namespace: desc-tom + name: rknop-dev-mongodb-pvc-20241030 + namespace: desc-tom-rknop-dev spec: accessModes: - ReadWriteOnce resources: requests: - storage: 200Gi + storage: 2048Gi storageClassName: nfs-client volumeMode: Filesystem diff --git a/spin_admin/rknop-dev/tom-mongodb.yaml b/spin_admin/rknop-dev/tom-mongodb.yaml new file mode 100644 index 00000000..6f09f225 --- /dev/null +++ b/spin_admin/rknop-dev/tom-mongodb.yaml @@ -0,0 +1,144 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mongodb + namespace: desc-tom-rknop-dev +spec: + progressDeadlineSeconds: 600 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-mongodb + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate + template: + metadata: + annotations: + field.cattle.io/ports: '[[]]' + nersc.gov/collab_uids: 
"70268" + nersc.gov/gid: "95089" + nersc.gov/gids: 95089,57177,60152 + nersc.gov/roles: user + nersc.gov/uid: "95089" + nersc.gov/username: raknop + creationTimestamp: null + labels: + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-mongodb + spec: + initContainers: + - args: + - -c + - chown 999:999 /data + command: + - /bin/sh + image: busybox + imagePullPolicy: Always + name: mongodb-volume-mount-hack + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /data + name: mongodb-datadir + containers: + - env: + - name: MONGO_INITDB_ROOT_USERNAME + valueFrom: + secretKeyRef: + key: mongodb_admin + name: tom-secrets + optional: false + - name: MONGO_INITDB_ROOT_PASSWORD + valueFrom: + secretKeyRef: + key: mongodb_admin_passwd + name: tom-secrets + optional: false + - name: MONGO_ALERT_WRITER_USERNAME + valueFrom: + secretKeyRef: + key: mongodb_alert_writer + name: tom-secrets + optional: false + - name: MONGO_ALERT_WRITER_PASSWORD + valueFrom: + secretKeyRef: + key: mongodb_alert_writer_password + name: tom-secrets + optional: false + - name: MONGO_ALERT_READER_USERNAME + valueFrom: + secretKeyRef: + key: mongodb_alert_reader + name: tom-secrets + optional: false + - name: MONGO_ALERT_READER_PASSWORD + valueFrom: + secretKeyRef: + key: mongodb_alert_reader_password + name: tom-secrets + optional: false + image: registry.nersc.gov/m1727/rknop/tom-mongodb:latest + imagePullPolicy: Always + name: mongodb + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: + - CHOWN + - DAC_OVERRIDE + - FOWNER + - SETGID + - SETUID + drop: + - ALL + privileged: false + readOnlyRootFilesystem: false + runAsNonRoot: false + stdin: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tty: true +# ports: +# - name: mongodb +# containerPort: 27017 +# protocol: TCP + volumeMounts: + - mountPath: /mongodb-data + name: mongodb-datadir + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + imagePullSecrets: + - name: registry-nersc + volumes: + - name: mongodb-datadir + persistentVolumeClaim: + claimName: rknop-dev-mongodb-pvc-20241030 +--- +apiVersion: v1 +kind: Service +metadata: + name: mongodb-service + namespace: desc-tom-rknop-dev +spec: + ports: + - name: default + port: 42 + protocol: TCP + targetPort: 42 + selector: + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-mongodb + sessionAffinity: None + type: ClusterIP +status: + loadBalancer: {} +--- diff --git a/spin_admin/rknop-dev/tom-pgdump.yaml b/spin_admin/rknop-dev/tom-pgdump.yaml new file mode 100644 index 00000000..8ea1d4cb --- /dev/null +++ b/spin_admin/rknop-dev/tom-pgdump.yaml @@ -0,0 +1,62 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: tom-pgdump + namespace: desc-tom-rknop-dev +spec: + concurrencyPolicy: Allow + failedJobsHistoryLimit: 1 + jobTemplate: + metadata: + creationTimestamp: null + spec: + template: + spec: + containers: + - command: + - /bin/bash + - /home/pgdump/run_pgdump.sh + env: + - name: PGDB + value: tom_desc + - name: PGHOST + value: tom-postgres + - name: PGPASSWORD + value: fragile + image: registry.nersc.gov/m1727/raknop/tom_pgdump + imagePullPolicy: Always + name: tom-pgdump + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: + - NET_BIND_SERVICE + drop: + - ALL + privileged: false + 
readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 95089 + stdin: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tty: true + volumeMounts: + - mountPath: /pgdump + name: pgdump + workingDir: /pgdump + dnsPolicy: ClusterFirst + restartPolicy: Never + schedulerName: default-scheduler + securityContext: + fsGroup: 57177 + terminationGracePeriodSeconds: 30 + volumes: + - hostPath: + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/pg_dump + type: Directory + name: pgdump + schedule: 0 5 * * 0 + successfulJobsHistoryLimit: 3 + suspend: false diff --git a/spin_admin/rknop-dev/tom-postgres-pvc.yaml b/spin_admin/rknop-dev/tom-postgres-pvc.yaml new file mode 100644 index 00000000..6940d1f4 --- /dev/null +++ b/spin_admin/rknop-dev/tom-postgres-pvc.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tom-rknop-dev-postgres-pvc-20241030 + namespace: desc-tom-rknop-dev +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2048Gi + storageClassName: nfs-client + volumeMode: Filesystem diff --git a/spin_admin/rknop-dev/tom-rknop-dev-postgres.yaml b/spin_admin/rknop-dev/tom-postgres.yaml similarity index 83% rename from spin_admin/rknop-dev/tom-rknop-dev-postgres.yaml rename to spin_admin/rknop-dev/tom-postgres.yaml index 18331ad9..bc3d65a9 100644 --- a/spin_admin/rknop-dev/tom-rknop-dev-postgres.yaml +++ b/spin_admin/rknop-dev/tom-postgres.yaml @@ -2,15 +2,15 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: tom-rknop-dev-postgres - namespace: desc-tom + name: tom-postgres + namespace: desc-tom-rknop-dev spec: progressDeadlineSeconds: 600 replicas: 1 revisionHistoryLimit: 10 selector: matchLabels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-postgres + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-postgres strategy: rollingUpdate: maxSurge: 1 @@ -29,22 +29,22 @@ spec: nersc.gov/username: raknop creationTimestamp: null labels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-postgres + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-postgres spec: initContainers: - name: volume-mount-hack image: busybox command: [ "sh", "-c", "chown 100:108 /pgdata" ] volumeMounts: - - name: tom-rknop-dev-postgres-pvc + - name: tom-postgres mountPath: /pgdata containers: - env: - name: POSTGRES_DATA_DIR value: /var/lib/postgresql/data - image: registry.nersc.gov/m1727/raknop/tom-postgres:daedalus + image: registry.nersc.gov/m1727/raknop/tom_postgres imagePullPolicy: Always - name: tom-rknop-dev-postgres + name: tom-postgres resources: {} securityContext: allowPrivilegeEscalation: false @@ -66,7 +66,7 @@ spec: tty: true volumeMounts: - mountPath: /var/lib/postgresql/data - name: tom-rknop-dev-postgres-pvc + name: tom-postgres - mountPath: /dev/shm name: dshm dnsPolicy: ClusterFirst @@ -77,19 +77,19 @@ spec: imagePullSecrets: - name: registry-nersc volumes: - - name: tom-rknop-dev-postgres-pvc + - name: tom-postgres persistentVolumeClaim: - claimName: tom-rknop-dev-postgres-pvc + claimName: tom-rknop-dev-postgres-pvc-20241030 - emptyDir: medium: Memory - sizeLimit: 16Gi + sizeLimit: 128Gi name: dshm --- apiVersion: v1 kind: Service metadata: - name: tom-rknop-dev-postgres - namespace: desc-tom + name: tom-postgres + namespace: desc-tom-rknop-dev spec: clusterIP: None clusterIPs: @@ -100,7 +100,7 @@ spec: protocol: TCP targetPort: 42 selector: - 
workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-postgres + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-postgres sessionAffinity: None type: ClusterIP status: diff --git a/spin_admin/rknop-dev/tom-rknop-dev-query-pruner-cron.yaml b/spin_admin/rknop-dev/tom-query-pruner.yaml similarity index 74% rename from spin_admin/rknop-dev/tom-rknop-dev-query-pruner-cron.yaml rename to spin_admin/rknop-dev/tom-query-pruner.yaml index 414639cb..b1a3bd38 100644 --- a/spin_admin/rknop-dev/tom-rknop-dev-query-pruner-cron.yaml +++ b/spin_admin/rknop-dev/tom-query-pruner.yaml @@ -1,8 +1,8 @@ apiVersion: batch/v1 kind: CronJob metadata: - name: tom-rknop-dev-query-pruner - namespace: desc-tom + name: tom-query-pruner + namespace: desc-tom-rknop-dev spec: concurrencyPolicy: Forbid failedJobsHistoryLimit: 1 @@ -14,7 +14,7 @@ spec: field.cattle.io/ports: '[[]]' nersc.gov/collab_uids: "70268" nersc.gov/gid: "95089" - nersc.gov/gids: 95089,60152,57177 + nersc.gov/gids: 95089,60152,57177,96414 nersc.gov/roles: user nersc.gov/uid: "95089" nersc.gov/username: raknop @@ -22,19 +22,19 @@ spec: spec: containers: - command: ['python'] - args: ['manage.py', 'long_query_runner', '-p', '1'] + args: ['manage.py', 'long_query_runner', '-p', '4'] env: - name: DB_HOST - value: tom-rknop-dev-postgres + value: tom-postgres - name: DB_NAME value: tom_desc - name: DB_PASS value: fragile - name: DB_USER value: postgres - image: registry.nersc.gov/m1727/raknop/tom_desc_bindmount:daedalus + image: registry.nersc.gov/m1727/raknop/tom_server_bindmount imagePullPolicy: Always - name: tom-rknop-dev-query-pruner + name: tom-query-pruner resources: {} securityContext: allowPrivilegeEscalation: false @@ -53,11 +53,11 @@ spec: tty: true volumeMounts: - mountPath: /secrets - name: tom-rknop-dev-secrets + name: tom-secrets - mountPath: /tom_desc - name: tom-rknop-dev-deployment + name: tom-deployment - mountPath: /query_results - name: tom-rknop-dev-query-results + name: tom-query-results workingDir: /tom_desc dnsPolicy: ClusterFirst restartPolicy: Never @@ -68,19 +68,19 @@ spec: imagePullSecrets: - name: registry-nersc volumes: - - name: tom-rknop-dev-secrets + - name: tom-secrets secret: defaultMode: 256 optional: false - secretName: tom-rknop-dev-secrets + secretName: tom-secrets - hostPath: - path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/dev/tom_desc/tom_desc + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc/tom_desc type: Directory - name: tom-rknop-dev-deployment + name: tom-deployment - hostPath: - path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/dev/query_results + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/query_results type: Directory - name: tom-rknop-dev-query-results - schedule: 15 8 * * * + name: tom-query-results + schedule: 20 8 * * * successfulJobsHistoryLimit: 3 suspend: false diff --git a/spin_admin/rknop-dev/tom-rknop-dev-query-runner.yaml b/spin_admin/rknop-dev/tom-query-runner.yaml similarity index 74% rename from spin_admin/rknop-dev/tom-rknop-dev-query-runner.yaml rename to spin_admin/rknop-dev/tom-query-runner.yaml index 3edf180d..7c535973 100644 --- a/spin_admin/rknop-dev/tom-rknop-dev-query-runner.yaml +++ b/spin_admin/rknop-dev/tom-query-runner.yaml @@ -2,15 +2,15 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: tom-rknop-dev-query-runner - namespace: desc-tom + name: tom-query-runner + namespace: desc-tom-rknop-dev spec: progressDeadlineSeconds: 600 
replicas: 1 revisionHistoryLimit: 10 selector: matchLabels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-query-runner + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-query-runner strategy: rollingUpdate: maxSurge: 1 @@ -22,27 +22,27 @@ spec: field.cattle.io/ports: '[[]]' nersc.gov/collab_uids: 70268,99896 nersc.gov/gid: "95089" - nersc.gov/gids: 45703,60152,57177,58102,59318,60070,63477,64483,79186,70268,92576,94721,95089,96414,99650 + nersc.gov/gids: 95089,57177,60152,96414 nersc.gov/roles: user nersc.gov/uid: "95089" nersc.gov/username: raknop creationTimestamp: null labels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-query-runner + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-query-runner spec: containers: - env: - name: DB_HOST - value: tom-rknop-dev-postgres + value: tom-postgres - name: DB_NAME value: tom_desc - name: DB_PASS value: fragile - name: DB_USER value: postgres - image: registry.nersc.gov/m1727/raknop/tom_desc_bindmount:daedalus + image: registry.nersc.gov/m1727/raknop/tom_server_bindmount imagePullPolicy: Always - name: tom-rknop-dev-query-runner + name: tom-query-runner resources: {} securityContext: allowPrivilegeEscalation: false @@ -61,11 +61,11 @@ spec: tty: true volumeMounts: - mountPath: /secrets - name: tom-rknop-dev-secrets + name: tom-secrets - mountPath: /tom_desc - name: tom-rknop-dev-deployment + name: tom-deployment - mountPath: /query_results - name: tom-rknop-dev-query-results + name: tom-query-results command: [ 'python' ] args: [ 'manage.py', 'long_query_runner', '-l' ] dnsConfig: {} @@ -78,25 +78,25 @@ spec: imagePullSecrets: - name: registry-nersc volumes: - - name: tom-rknop-dev-secrets + - name: tom-secrets secret: defaultMode: 256 optional: false - secretName: tom-rknop-dev-secrets + secretName: tom-secrets - hostPath: - path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/dev/tom_desc/tom_desc + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc/tom_desc type: Directory - name: tom-rknop-dev-deployment + name: tom-deployment - hostPath: - path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/dev/query_results + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/query_results type: Directory - name: tom-rknop-dev-query-results + name: tom-query-results --- apiVersion: v1 kind: Service metadata: - name: tom-rknop-dev-query-runner - namespace: desc-tom + name: tom-query-runner + namespace: desc-tom-rknop-dev spec: ports: - name: default @@ -104,7 +104,7 @@ spec: protocol: TCP targetPort: 42 selector: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-query-runner + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-query-runner sessionAffinity: None type: ClusterIP status: diff --git a/spin_admin/rknop-dev/tom-secrets.yaml b/spin_admin/rknop-dev/tom-secrets.yaml new file mode 100644 index 00000000..9903809a --- /dev/null +++ b/spin_admin/rknop-dev/tom-secrets.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +data: + pitt_google_auth_key.json: 
ewogICJ0eXBlIjogInNlcnZpY2VfYWNjb3VudCIsCiAgInByb2plY3RfaWQiOiAiZWxhc3RpY2MtY2hhbGxlbmdlIiwKICAicHJpdmF0ZV9rZXlfaWQiOiAiNGM1YjM4MmQxNWQ0YzE1ZWI5ZDU0MDc4NWE5Yjk4ZTU2MzcxMmQxOCIsCiAgInByaXZhdGVfa2V5IjogIi0tLS0tQkVHSU4gUFJJVkFURSBLRVktLS0tLVxuTUlJRXZnSUJBREFOQmdrcWhraUc5dzBCQVFFRkFBU0NCS2d3Z2dTa0FnRUFBb0lCQVFEVTZaYXQ5ZDhmNzVGU1xuS2g3RGtmSktQdFFINGVVYU03dU5RaHcyeXdRM0VMcUdIWDB1OVlYVDdnK3VrbzZSeGMzdU9vbU5ldkljTnBGblxubnBZNi9meHFiR0pUSjA4UXBxM01NTzJpZk13ZDdNVS9Tc2tQeHNYcG9nS2UvczVySVJRY2dNSE9sdUxiRlEzSlxuRWxKMGJsYytBMWlZczFJd1dZVm9UYnd2WkhQK0IxenF5Uk1ndnpwb3pkRG1UaC92Tk9QcU0wNVJlUHl2VjNDVlxuOW1XNWtyTFVuZVA2RGFkZjBuNk9tRVh4WVBiaFpTQ2dwMEpVbDkxdUhtYjlHaU9tQzBtNDVkYmtlWGJjMjdqQ1xuTlAwWWorYTREVzA3aUlyV1JVZWVhc2luK0loZWlsQ0NETDl1dmpGbXZWS3NRT0hKREgrUkVrb3Q3TW43clRFT1xuK3BEbkYwUXRBZ01CQUFFQ2dnRUFMUDE1SmlSYzlEb2kycHBrNnMyVnhjT1YrVjV1RTRYWC9XYjVZRWl4Q05QRFxubm1CdjJmTHV4eFE0Z3hxYi9zaXVGY2VQMW5JaTQveFI2SFAxNmR4T3NNeGJVSmZ2V0tUbjk4eGxBcDlkWUh0dVxuU013TVJZV3F2NDVKQUlEMmJHUE9oS0Zla3NHR2lXWGZCbHlWU040aUdFd0x0aVM2V3RZbjN4MENCWCswRlBybFxuWmhPTDZmTk94Zk53UTM4YjdWeWxNUXM0MW1ZUEpNM2dPVHE3UGpjRytCZEkvNU85YitDZ2tHN3RqKzM5S25qaFxuSUlLWGpoOVRlSmRRcFNjV2lyVmE0Yi92YmJ1QkJtc2g4WDFyR2lvV2I1QTJQY0tJQWl3bUFWeHlKTnFBU2N2TlxuL2RIZFdtb1krazFONDkwc09pNVpGSW5DMGhKS2tSbHJ2RDJXMnBCL1lRS0JnUUQ2K2x3S2dBYmhkenRTY1RiR1xuZi9ZZGd4M1h3MHA3a0NueTRWSzJPVHNxRGVMeDJlWUw1UDY1UElRcnU5d2FmS3lwOXBGVzlUTWt4eXdZYkFUcFxuOVZUVzdKS0dOM2xWdDBkUm9WcWxwOXFpUVdHaUtaYnVCaHNwK0hvL2QzS3NURTJaMXdxdGRKcGhyM3Q2bHdrQlxuZVYyUm0ycWF4U3RJVTVFdkpFZ051KzJGMFFLQmdRRFpMRHpTc2hlMkdRdHUzYWlDbWxXTWFNdWRYUy9qMlBXTlxuZU5hOEJzUHViV0pHOS9HY05UTU9nNHZxK3dkenAyTjdrSzZoQ1pXeWxNS3U1K1dlOVlKa0VDVFI5a25kU1hualxuT0pBOVhWVGN3OUdTajBXdVNhUUpDNitrUE9mWm0yMXZ5cE0rc3IreHpHc045TjMxdGhNVW5ZdXNNaDZCZVUyL1xuR05kTUd4N0RuUUtCZ0ZxekIzZ3o5RWllMGkxbEJLcVBxOTNXZUg1MDhRSGVnNHBtcTRGM0JVNlpYaDZRNUhYL1xuOTUzeHBNSTVUYjIwdUtGam5mcVMranljVGU4MWVrRWlpOXB2L0RQTThHaHRwN1IwWDFIbWF6aWY4Q3l1WXdENVxuNis4ckFPTFd3L1pJVFBML3A3Z3ZuMnNXQThXMXQyZDArTCthTHRobjBOQitnc1ZLazArRFRyOUJBb0dCQU1OYVxuRDd4WGh3eStxcElud1pUNXowcG1XZ0pMK1BBMEVXNFk3TkJtSllrcnkrT2ExVE4yaDE3ZTVkV1J5Ym8zTUQzT1xuZjhkTUgrUnIvRlBwTlp3dnJBTG1vcmhibTBYNitqRnlpQTByRDFNbXNqWHJ1dnAwTTZJUGFRZUw0ZUxVOVRSQ1xuUFFHQk9QNXVxejM2cms2QVlkbUJma3AvS3dHOFpEc3AxVUVZaEN2aEFvR0JBTWRYN2FHei94WXhYdFl1SVZsY1xua25Kd1dHZGRsdzhHMUlWOHcycmozZFJha2NRNCthOFg4T2c1Z045UlRGdDc4V2VVcmVBRGUrWFRwSGpoU1BWTFxuN0NudEoxZWkzcURiYm9CZFZGTUV4akROQ3BFL3JaMmdVNmpBQWRmS2pKRi9FTmd6aW5yM3kwWjdZcy9tUUtLT1xuV1JzSlNQWEIwd204UWJSR0YxT3FXMXdpXG4tLS0tLUVORCBQUklWQVRFIEtFWS0tLS0tXG4iLAogICJjbGllbnRfZW1haWwiOiAiZGVzYy10b21AZWxhc3RpY2MtY2hhbGxlbmdlLmlhbS5nc2VydmljZWFjY291bnQuY29tIiwKICAiY2xpZW50X2lkIjogIjEwNTk3ODQyOTA3NDg4MDIxMzI1OCIsCiAgImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKICAidG9rZW5fdXJpIjogImh0dHBzOi8vb2F1dGgyLmdvb2dsZWFwaXMuY29tL3Rva2VuIiwKICAiYXV0aF9wcm92aWRlcl94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL29hdXRoMi92MS9jZXJ0cyIsCiAgImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvZGVzYy10b20lNDBlbGFzdGljYy1jaGFsbGVuZ2UuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAogICJ1bml2ZXJzZV9kb21haW4iOiAiZ29vZ2xlYXBpcy5jb20iCn0K + alerce_passwd: ZWxhc3RpY2N2Mgo= + alerce_username: ZWxhc3RpY2M= + antares_passwd: Vmp4YF5uQX5RUVNORWg1NmVeTVo0amAjeHp5dVVpJiM3UVBoeXNHNG9OYWJ2fjRvTSM3UkAjd2lNMlNoMjNLNQ== + antares_username: Uk1XTUJHUEtIV0hGTUdVVQ== + django_secret_key: TE9SN1hUdGh2MjNRZmFIYmlkUHVBREdFQmVKTXBsbGk= + postgres_elasticc_admin_ro_password: bmZuaWhuYWs4dThq + postgres_elasticc_ro_password: MjB2dDVkdHVhYTBv + 
postgres_password: ZnJhZ2lsZQ== + postgres_ro_password: aDI2cmowMHY0b3Ax + mongodb_admin: bW9uZ29kYl9hZG1pbg== + mongodb_admin_passwd: eWp0Zmw3eWRvcnA4Cg== + mongodb_alert_reader: bW9uZ29kYl9hbGVydF9yZWFkZXI= + mongodb_alert_reader_password: aXdkeTY2bHBpOTc3Cg== + mongodb_alert_writer: bW9uZ29kYl9hbGVydF93cml0ZXI= + mongodb_alert_writer_password: bnkxNGl0NjE0emo3Cg== + ppdb_reader: cHBkYl9yZWFkZXI= + ppdb_reader_password: YzRzMzhjdHNwd2ltCg== + ppdb_writer: cHBkYl93cml0ZXI= + ppdb_writer_password: MW1za3FkemVsYThzCg== +kind: Secret +metadata: + name: tom-secrets + namespace: desc-tom-rknop-dev +type: Opaque diff --git a/spin_admin/rknop-dev/tom-send-alerts-cron.yaml b/spin_admin/rknop-dev/tom-send-alerts-cron.yaml new file mode 100644 index 00000000..72e080b6 --- /dev/null +++ b/spin_admin/rknop-dev/tom-send-alerts-cron.yaml @@ -0,0 +1,101 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: tom-send-alerts + namespace: desc-tom-rknop-dev +spec: + concurrencyPolicy: Forbid + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + metadata: + annotations: + field.cattle.io/ports: '[[]]' + nersc.gov/collab_uids: "70268" + nersc.gov/gid: "95089" + nersc.gov/gids: 45703,60152,57177,58102,59318,60070,63477,64483,79186,70268,92576,95089,96414 + nersc.gov/roles: user + nersc.gov/uid: "95089" + nersc.gov/username: raknop + creationTimestamp: null + spec: + containers: + - command: ['python'] + args: ['manage.py', 'send_elasticc2_alerts', + '-a', '36', + '-k', 'public.alerts.ztf.uw.edu:9092', + '--wfd-topic', 'elasticc2-2-wfd', + '--ddf-full-topic', 'elasticc2-2-ddf-full', + '--ddf-limited-topic', 'elasticc2-2-ddf-limited', + '--do'] + # args: ['manage.py', 'send_elasticc2_alerts', + # '-a', '3', + # '-k', 'kafka-server:9092', + # '--wfd-topic', 'alerts-wfd', + # '--ddf-full-topic', 'alerts-ddf-full', + # '--ddf-limited-topic', 'alerts-ddf-limited', + # '--do'] + env: + - name: DB_HOST + value: tom-postgres + - name: DB_NAME + value: tom_desc + - name: DB_PASS + value: fragile + - name: DB_USER + value: postgres + - name: CASSANDRA_HOST + value: tom-cassandra + - name: CASSANDRA_DB + value: tom_desc + - name: CASSANDRA_USER + value: cassandra + - name: CASSANDRA_PASSWORD + value: cassandra + image: registry.nersc.gov/m1727/raknop/tom_server_bindmount + imagePullPolicy: Always + name: tom-send-alerts + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: + - NET_BIND_SERVICE + drop: + - ALL + privileged: false + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 95089 + stdin: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tty: true + volumeMounts: + - mountPath: /secrets + name: tom-secrets + - mountPath: /tom_desc + name: tom-deployment + workingDir: /tom_desc + dnsPolicy: ClusterFirst + restartPolicy: Never + schedulerName: default-scheduler + securityContext: + fsGroup: 57177 + terminationGracePeriodSeconds: 30 + imagePullSecrets: + - name: registry-nersc + volumes: + - name: tom-secrets + secret: + defaultMode: 256 + optional: false + secretName: tom-secrets + - hostPath: + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc/tom_desc + type: Directory + name: tom-deployment + schedule: 0 0 * * * + successfulJobsHistoryLimit: 3 + suspend: true diff --git a/spin_admin/rknop-dev/tom-rknop-dev-app-shell.yaml b/spin_admin/rknop-dev/tom-shell.yaml similarity index 51% rename from spin_admin/rknop-dev/tom-rknop-dev-app-shell.yaml rename to 
spin_admin/rknop-dev/tom-shell.yaml index 43473956..75d2219f 100644 --- a/spin_admin/rknop-dev/tom-rknop-dev-app-shell.yaml +++ b/spin_admin/rknop-dev/tom-shell.yaml @@ -2,17 +2,17 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: tom-rknop-dev-app-shell - namespace: desc-tom + name: tom-shell + namespace: desc-tom-rknop-dev labels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app-shell + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-shell spec: progressDeadlineSeconds: 600 replicas: 1 revisionHistoryLimit: 10 selector: matchLabels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app-shell + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-shell strategy: rollingUpdate: maxSurge: 1 @@ -21,21 +21,41 @@ spec: template: metadata: labels: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app-shell + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-shell spec: containers: - env: - name: DB_HOST - value: tom-rknop-dev-postgres + value: tom-postgres - name: DB_NAME value: tom_desc - name: DB_PASS value: fragile - name: DB_USER value: postgres - image: registry.nersc.gov/m1727/raknop/tom_desc_bindmount:daedalus + - name: CASSANDRA_HOST + value: tom-cassandra + - name: CASSANDRA_DB + value: tom_desc + - name: CASSANDRA_USER + value: cassandra + - name: CASSANDRA_PASSWORD + value: cassandra + - name: FINK_GROUP_ID + value: lsstfr-johann + - name: FINK_SERVER + value: 134.158.74.95:24499, + - name: FINK_TOPIC + value: fink_early_sn_candidates_ztf + - name: FINK_USERNAME + value: johann + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secrets/pitt_google_auth_key.json + - name: GOOGLE_CLOUD_PROJECT + value: elasticc-challenge + image: registry.nersc.gov/m1727/raknop/tom_server_bindmount imagePullPolicy: Always - name: tom-rknop-dev-app + name: tom-shell resources: {} securityContext: allowPrivilegeEscalation: false @@ -54,13 +74,19 @@ spec: tty: true volumeMounts: - mountPath: /secrets - name: tom-rknop-dev-secrets + name: tom-secrets - mountPath: /tom_desc - name: tom-rknop-dev-deployment + name: tom-deployment + - mountPath: /code + name: tom-checkout - mountPath: /query_results - name: tom-rknop-dev-query-results -# - mountPath: /snana_data -# name: tom-rknop-dev-snana-data + name: tom-query-results + - mountPath: /sample + name: tom-sample + - mountPath: /pgdump + name: pgdump + # - mountPath: /pgdata + # name: tom-desc-postgres command: [ "tail" ] args: [ "-f", "/etc/issue" ] dnsConfig: {} @@ -73,37 +99,51 @@ spec: imagePullSecrets: - name: registry-nersc volumes: - - name: tom-rknop-dev-secrets + - name: tom-secrets secret: defaultMode: 256 optional: false - secretName: tom-rknop-dev-secrets + secretName: tom-secrets + - hostPath: + path: /global/cfs/cdirs/desc-td/SOFTWARE/tom_deployment/rknop_dev/tom_desc/tom_desc + type: Directory + name: tom-deployment + - hostPath: + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc + type: Directory + name: tom-checkout + - hostPath: + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/query_results + type: Directory + name: tom-query-results - hostPath: - path: /global/cfs/cdirs/desc-td/SOFTWARE/tom_deployment/dev/tom_desc/tom_desc + path: /global/cfs/cdirs/desc-td/ELASTICC2 type: Directory - name: tom-rknop-dev-deployment + name: tom-sample - hostPath: - path: 
/global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/dev/query_results + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/pg_dump type: Directory - name: tom-rknop-dev-query-results -# - hostPath: -# path: /global/cfs/cdirs/desc-td/ELASTICC_DATA -# type: Directory -# name: tom-rknop-dev-snana-data + name: pgdump + # - name: tom-desc-postgres + # persistentVolumeClaim: + # claimName: tom-desc-postgres --- apiVersion: v1 kind: Service metadata: - name: tom-rknop-dev-app-shell - namespace: desc-tom + name: tom-shell + namespace: desc-tom-rknop-dev spec: + clusterIP: None + clusterIPs: + - None ports: - name: default port: 42 protocol: TCP targetPort: 42 selector: - workload.user.cattle.io/workloadselector: deployment-desc-tom-tom-rknop-dev-app-shell + workload.user.cattle.io/workloadselector: deployment-desc-tom-rknop-dev-tom-shell sessionAffinity: None type: ClusterIP status: diff --git a/spin_admin/rknop-dev/tom-update-metrics-cron.yaml b/spin_admin/rknop-dev/tom-update-metrics-cron.yaml new file mode 100644 index 00000000..d87e8efa --- /dev/null +++ b/spin_admin/rknop-dev/tom-update-metrics-cron.yaml @@ -0,0 +1,93 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: tom-update-metrics + namespace: desc-tom-rknop-dev +spec: + concurrencyPolicy: Forbid + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + metadata: + annotations: + field.cattle.io/ports: '[[]]' + nersc.gov/collab_uids: "70268" + nersc.gov/gid: "95089" + nersc.gov/gids: 45703,60152,57177,58102,59318,60070,63477,64483,79186,70268,92576,95089,96414 + nersc.gov/roles: user + nersc.gov/uid: "95089" + nersc.gov/username: raknop + creationTimestamp: null + spec: + containers: + - command: [ '/bin/bash', '-c' ] + args: + - | + python manage.py gen_elasticc2_brokerdelaygraphs_pg --t0 2023-11-11 + python manage.py gen_confmatrix_last + python manage.py gen_elasticc2_brokercompleteness --t0 2023-11-11 +# python manage.py update_elasticc2_alerts_sent --start 2023-11-11 --endrel 0 + env: + - name: DB_HOST + value: tom-postgres + - name: DB_NAME + value: tom_desc + - name: DB_PASS + value: fragile + - name: DB_USER + value: postgres + - name: CASSANDRA_HOST + value: tom-cassandra + - name: CASSANDRA_DB + value: tom_desc + - name: CASSANDRA_USER + value: cassandra + - name: CASSANDRA_PASSWORD + value: cassandra + image: registry.nersc.gov/m1727/raknop/tom_server_bindmount + imagePullPolicy: Always + name: tom-update-metrics + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: + - NET_BIND_SERVICE + drop: + - ALL + privileged: false + readOnlyRootFilesystem: false + runAsNonRoot: true + runAsUser: 95089 + stdin: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tty: true + volumeMounts: + - mountPath: /secrets + name: tom-secrets + - mountPath: /tom_desc + name: tom-deployment + workingDir: /tom_desc + dnsPolicy: ClusterFirst + restartPolicy: Never + schedulerName: default-scheduler + securityContext: + fsGroup: 57177 + terminationGracePeriodSeconds: 30 + imagePullSecrets: + - name: registry-nersc + volumes: + - name: tom-secrets + secret: + defaultMode: 256 + optional: false + secretName: tom-secrets + - hostPath: + path: /global/cfs/cdirs/lsst/groups/TD/SOFTWARE/tom_deployment/rknop_dev/tom_desc/tom_desc + type: Directory + name: tom-deployment + schedule: 0 12 * * * + successfulJobsHistoryLimit: 3 + suspend: true From a4bd5e17d126e368acb9dcce84770b3a0784367a Mon Sep 17 00:00:00 2001 From: "Rob Knop (Nersc)" Date: Wed, 30 Oct 
2024 14:02:14 -0700 Subject: [PATCH 2/5] *facepalm* --- spin_admin/rknop-dev/tom-secrets.yaml | 40 +++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/spin_admin/rknop-dev/tom-secrets.yaml b/spin_admin/rknop-dev/tom-secrets.yaml index 9903809a..c9270244 100644 --- a/spin_admin/rknop-dev/tom-secrets.yaml +++ b/spin_admin/rknop-dev/tom-secrets.yaml @@ -1,25 +1,25 @@ apiVersion: v1 data: - pitt_google_auth_key.json: ewogICJ0eXBlIjogInNlcnZpY2VfYWNjb3VudCIsCiAgInByb2plY3RfaWQiOiAiZWxhc3RpY2MtY2hhbGxlbmdlIiwKICAicHJpdmF0ZV9rZXlfaWQiOiAiNGM1YjM4MmQxNWQ0YzE1ZWI5ZDU0MDc4NWE5Yjk4ZTU2MzcxMmQxOCIsCiAgInByaXZhdGVfa2V5IjogIi0tLS0tQkVHSU4gUFJJVkFURSBLRVktLS0tLVxuTUlJRXZnSUJBREFOQmdrcWhraUc5dzBCQVFFRkFBU0NCS2d3Z2dTa0FnRUFBb0lCQVFEVTZaYXQ5ZDhmNzVGU1xuS2g3RGtmSktQdFFINGVVYU03dU5RaHcyeXdRM0VMcUdIWDB1OVlYVDdnK3VrbzZSeGMzdU9vbU5ldkljTnBGblxubnBZNi9meHFiR0pUSjA4UXBxM01NTzJpZk13ZDdNVS9Tc2tQeHNYcG9nS2UvczVySVJRY2dNSE9sdUxiRlEzSlxuRWxKMGJsYytBMWlZczFJd1dZVm9UYnd2WkhQK0IxenF5Uk1ndnpwb3pkRG1UaC92Tk9QcU0wNVJlUHl2VjNDVlxuOW1XNWtyTFVuZVA2RGFkZjBuNk9tRVh4WVBiaFpTQ2dwMEpVbDkxdUhtYjlHaU9tQzBtNDVkYmtlWGJjMjdqQ1xuTlAwWWorYTREVzA3aUlyV1JVZWVhc2luK0loZWlsQ0NETDl1dmpGbXZWS3NRT0hKREgrUkVrb3Q3TW43clRFT1xuK3BEbkYwUXRBZ01CQUFFQ2dnRUFMUDE1SmlSYzlEb2kycHBrNnMyVnhjT1YrVjV1RTRYWC9XYjVZRWl4Q05QRFxubm1CdjJmTHV4eFE0Z3hxYi9zaXVGY2VQMW5JaTQveFI2SFAxNmR4T3NNeGJVSmZ2V0tUbjk4eGxBcDlkWUh0dVxuU013TVJZV3F2NDVKQUlEMmJHUE9oS0Zla3NHR2lXWGZCbHlWU040aUdFd0x0aVM2V3RZbjN4MENCWCswRlBybFxuWmhPTDZmTk94Zk53UTM4YjdWeWxNUXM0MW1ZUEpNM2dPVHE3UGpjRytCZEkvNU85YitDZ2tHN3RqKzM5S25qaFxuSUlLWGpoOVRlSmRRcFNjV2lyVmE0Yi92YmJ1QkJtc2g4WDFyR2lvV2I1QTJQY0tJQWl3bUFWeHlKTnFBU2N2TlxuL2RIZFdtb1krazFONDkwc09pNVpGSW5DMGhKS2tSbHJ2RDJXMnBCL1lRS0JnUUQ2K2x3S2dBYmhkenRTY1RiR1xuZi9ZZGd4M1h3MHA3a0NueTRWSzJPVHNxRGVMeDJlWUw1UDY1UElRcnU5d2FmS3lwOXBGVzlUTWt4eXdZYkFUcFxuOVZUVzdKS0dOM2xWdDBkUm9WcWxwOXFpUVdHaUtaYnVCaHNwK0hvL2QzS3NURTJaMXdxdGRKcGhyM3Q2bHdrQlxuZVYyUm0ycWF4U3RJVTVFdkpFZ051KzJGMFFLQmdRRFpMRHpTc2hlMkdRdHUzYWlDbWxXTWFNdWRYUy9qMlBXTlxuZU5hOEJzUHViV0pHOS9HY05UTU9nNHZxK3dkenAyTjdrSzZoQ1pXeWxNS3U1K1dlOVlKa0VDVFI5a25kU1hualxuT0pBOVhWVGN3OUdTajBXdVNhUUpDNitrUE9mWm0yMXZ5cE0rc3IreHpHc045TjMxdGhNVW5ZdXNNaDZCZVUyL1xuR05kTUd4N0RuUUtCZ0ZxekIzZ3o5RWllMGkxbEJLcVBxOTNXZUg1MDhRSGVnNHBtcTRGM0JVNlpYaDZRNUhYL1xuOTUzeHBNSTVUYjIwdUtGam5mcVMranljVGU4MWVrRWlpOXB2L0RQTThHaHRwN1IwWDFIbWF6aWY4Q3l1WXdENVxuNis4ckFPTFd3L1pJVFBML3A3Z3ZuMnNXQThXMXQyZDArTCthTHRobjBOQitnc1ZLazArRFRyOUJBb0dCQU1OYVxuRDd4WGh3eStxcElud1pUNXowcG1XZ0pMK1BBMEVXNFk3TkJtSllrcnkrT2ExVE4yaDE3ZTVkV1J5Ym8zTUQzT1xuZjhkTUgrUnIvRlBwTlp3dnJBTG1vcmhibTBYNitqRnlpQTByRDFNbXNqWHJ1dnAwTTZJUGFRZUw0ZUxVOVRSQ1xuUFFHQk9QNXVxejM2cms2QVlkbUJma3AvS3dHOFpEc3AxVUVZaEN2aEFvR0JBTWRYN2FHei94WXhYdFl1SVZsY1xua25Kd1dHZGRsdzhHMUlWOHcycmozZFJha2NRNCthOFg4T2c1Z045UlRGdDc4V2VVcmVBRGUrWFRwSGpoU1BWTFxuN0NudEoxZWkzcURiYm9CZFZGTUV4akROQ3BFL3JaMmdVNmpBQWRmS2pKRi9FTmd6aW5yM3kwWjdZcy9tUUtLT1xuV1JzSlNQWEIwd204UWJSR0YxT3FXMXdpXG4tLS0tLUVORCBQUklWQVRFIEtFWS0tLS0tXG4iLAogICJjbGllbnRfZW1haWwiOiAiZGVzYy10b21AZWxhc3RpY2MtY2hhbGxlbmdlLmlhbS5nc2VydmljZWFjY291bnQuY29tIiwKICAiY2xpZW50X2lkIjogIjEwNTk3ODQyOTA3NDg4MDIxMzI1OCIsCiAgImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKICAidG9rZW5fdXJpIjogImh0dHBzOi8vb2F1dGgyLmdvb2dsZWFwaXMuY29tL3Rva2VuIiwKICAiYXV0aF9wcm92aWRlcl94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL29hdXRoMi92MS9jZXJ0cyIsCiAgImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvZGVzYy10b20lNDBlbGFzdGljYy1jaGFsbGVuZ2UuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAo
gICJ1bml2ZXJzZV9kb21haW4iOiAiZ29vZ2xlYXBpcy5jb20iCn0K - alerce_passwd: ZWxhc3RpY2N2Mgo= - alerce_username: ZWxhc3RpY2M= - antares_passwd: Vmp4YF5uQX5RUVNORWg1NmVeTVo0amAjeHp5dVVpJiM3UVBoeXNHNG9OYWJ2fjRvTSM3UkAjd2lNMlNoMjNLNQ== - antares_username: Uk1XTUJHUEtIV0hGTUdVVQ== - django_secret_key: TE9SN1hUdGh2MjNRZmFIYmlkUHVBREdFQmVKTXBsbGk= - postgres_elasticc_admin_ro_password: bmZuaWhuYWs4dThq - postgres_elasticc_ro_password: MjB2dDVkdHVhYTBv - postgres_password: ZnJhZ2lsZQ== - postgres_ro_password: aDI2cmowMHY0b3Ax - mongodb_admin: bW9uZ29kYl9hZG1pbg== - mongodb_admin_passwd: eWp0Zmw3eWRvcnA4Cg== - mongodb_alert_reader: bW9uZ29kYl9hbGVydF9yZWFkZXI= - mongodb_alert_reader_password: aXdkeTY2bHBpOTc3Cg== - mongodb_alert_writer: bW9uZ29kYl9hbGVydF93cml0ZXI= - mongodb_alert_writer_password: bnkxNGl0NjE0emo3Cg== - ppdb_reader: cHBkYl9yZWFkZXI= - ppdb_reader_password: YzRzMzhjdHNwd2ltCg== - ppdb_writer: cHBkYl93cml0ZXI= - ppdb_writer_password: MW1za3FkemVsYThzCg== + pitt_google_auth_key.json: PUT THE RIGHT THING HERE + alerce_passwd: PUT THE RIGHT THING HERE + alerce_username: PUT THE RIGHT THING HERE + antares_passwd: PUT THE RIGHT THING HERE + antares_username: PUT THE RIGHT THING HERE + django_secret_key: PUT THE RIGHT THING HERE + postgres_elasticc_admin_ro_password: PUT THE RIGHT THING HERE + postgres_elasticc_ro_password: PUT THE RIGHT THING HERE + postgres_password: PUT THE RIGHT THING HERE + postgres_ro_password: PUT THE RIGHT THING HERE + mongodb_admin: PUT THE RIGHT THING HERE + mongodb_admin_passwd: PUT THE RIGHT THING HERE + mongodb_alert_reader: PUT THE RIGHT THING HERE + mongodb_alert_reader_password: PUT THE RIGHT THING HERE + mongodb_alert_writer: PUT THE RIGHT THING HERE + mongodb_alert_writer_password: PUT THE RIGHT THING HERE + ppdb_reader: PUT THE RIGHT THING HERE + ppdb_reader_password: PUT THE RIGHT THING HERE + ppdb_writer: PUT THE RIGHT THING HERE + ppdb_writer_password: PUT THE RIGHT THING HERE kind: Secret metadata: name: tom-secrets From 6cc29960ef44b463b5946722028cbf747b970433 Mon Sep 17 00:00:00 2001 From: "Rob Knop (Nersc)" Date: Wed, 30 Oct 2024 15:32:39 -0700 Subject: [PATCH 3/5] yaml file updates --- spin_admin/rknop-dev/tom-app.yaml | 26 +++++++++++++------------- spin_admin/rknop-dev/tom-mongodb.yaml | 9 ++++----- spin_admin/tom-mongodb.yaml | 7 +++---- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/spin_admin/rknop-dev/tom-app.yaml b/spin_admin/rknop-dev/tom-app.yaml index b8fa18d2..0e042191 100644 --- a/spin_admin/rknop-dev/tom-app.yaml +++ b/spin_admin/rknop-dev/tom-app.yaml @@ -198,17 +198,17 @@ spec: port: number: 8080 pathType: ImplementationSpecific -# - host: desc-tom-rknop-dev.lbl.gov -# http: -# paths: -# - backend: -# service: -# name: tom-ingress-service -# port: -# number: 8080 -# pathType: ImplementationSpecific -# tls: -# - hosts: -# - desc-tom-rknop-dev.lbl.gov -# secretName: tom-cert + - host: desc-tom-rknop-dev.lbl.gov + http: + paths: + - backend: + service: + name: tom-ingress-service + port: + number: 8080 + pathType: ImplementationSpecific + tls: + - hosts: + - desc-tom-rknop-dev.lbl.gov + secretName: tom-cert --- diff --git a/spin_admin/rknop-dev/tom-mongodb.yaml b/spin_admin/rknop-dev/tom-mongodb.yaml index 6f09f225..2eee32d1 100644 --- a/spin_admin/rknop-dev/tom-mongodb.yaml +++ b/spin_admin/rknop-dev/tom-mongodb.yaml @@ -105,10 +105,6 @@ spec: terminationMessagePath: /dev/termination-log terminationMessagePolicy: File tty: true -# ports: -# - name: mongodb -# containerPort: 27017 -# protocol: TCP volumeMounts: - 
mountPath: /mongodb-data name: mongodb-datadir @@ -127,9 +123,12 @@ spec: apiVersion: v1 kind: Service metadata: - name: mongodb-service + name: mongodb namespace: desc-tom-rknop-dev spec: + clusterIP: None + clusterIPs: + - None ports: - name: default port: 42 diff --git a/spin_admin/tom-mongodb.yaml b/spin_admin/tom-mongodb.yaml index 3905740f..cc641b9f 100644 --- a/spin_admin/tom-mongodb.yaml +++ b/spin_admin/tom-mongodb.yaml @@ -105,10 +105,6 @@ spec: terminationMessagePath: /dev/termination-log terminationMessagePolicy: File tty: true -# ports: -# - name: mongodb -# containerPort: 27017 -# protocol: TCP volumeMounts: - mountPath: /mongodb-data name: mongodb-datadir @@ -130,6 +126,9 @@ metadata: name: mongodb-service namespace: desc-tom spec: + clusterIP: None + clusterIPs: + - None ports: - name: default port: 42 From 8c2b6505a4feff3ac31dcd05a61a60a778d01bd5 Mon Sep 17 00:00:00 2001 From: Rob Knop Date: Thu, 31 Oct 2024 10:52:06 -0700 Subject: [PATCH 4/5] In the middle of some refactoring to reduce shared code between elasticc2 and fastdb_dev. test_elasticc2_alert_cycle passes --- .gitignore | 3 +- docker_mongodb/Dockerfile | 1 + fastdb_get_alerts/process_alerts.py | 675 ------------------ logs/.placeholder | 1 + tests/docker-compose.yaml | 122 ++-- .../db/management/commands/_brokerconsumer.py | 488 +++++++++++++ .../management/commands/brokerpoll2.py | 451 +----------- .../commands/send_elasticc2_alerts.py | 15 +- tom_desc/fastdb_dev/DataTools.py | 3 +- .../commands/fastdb_dev_brokerpoll.py} | 125 ++-- .../management/commands/load_fastdb.py | 6 +- 11 files changed, 673 insertions(+), 1217 deletions(-) delete mode 100644 fastdb_get_alerts/process_alerts.py create mode 100644 logs/.placeholder create mode 100644 tom_desc/db/management/commands/_brokerconsumer.py rename tom_desc/{db/management/commands/create_cassandra_db.py => fastdb_dev/management/commands/fastdb_dev_brokerpoll.py} (61%) diff --git a/.gitignore b/.gitignore index 1392f54a..b9cf019c 100644 --- a/.gitignore +++ b/.gitignore @@ -141,7 +141,8 @@ tom_desc/elasticc2/static/elasticc2/alertsendfinish tom_desc/elasticc2/static/elasticc2/alertsend # Log directory -logs/ +logs/** +!logs/.placeholder # FastDB files tom_desc/fastdb_dev/management/commands/insert_last_time.sql diff --git a/docker_mongodb/Dockerfile b/docker_mongodb/Dockerfile index cf04348d..d89e30af 100644 --- a/docker_mongodb/Dockerfile +++ b/docker_mongodb/Dockerfile @@ -14,6 +14,7 @@ RUN set -eux; \ jq \ numactl \ procps \ + netcat-openbsd \ ; \ rm -rf /var/lib/apt/lists/* diff --git a/fastdb_get_alerts/process_alerts.py b/fastdb_get_alerts/process_alerts.py deleted file mode 100644 index 5e65b65a..00000000 --- a/fastdb_get_alerts/process_alerts.py +++ /dev/null @@ -1,675 +0,0 @@ -from pymongo import MongoClient -import sys -import pathlib -import logging -import fastavro -import json -import multiprocessing -import signal -import time -import confluent_kafka -import io -import os -import re -import traceback -import datetime -import collections -import atexit -import argparse -import urllib - -_rundir = pathlib.Path(__file__).parent -_logdir = pathlib.Path(__file__).parent.parent -print(_rundir) - -_logger = logging.getLogger(__name__) -if not _logger.hasHandlers(): - _logout = logging.FileHandler( _logdir / f"logs/broker.log" ) - _logger.addHandler( _logout ) - _formatter = logging.Formatter( f'[msgconsumer - %(asctime)s - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) - _logout.setFormatter( _formatter ) -_logger.setLevel( logging.DEBUG ) - 
-def _donothing( *args, **kwargs ): - pass - -def close_msg_consumer( obj ): - obj.close() - - -class MsgConsumer(object): - def __init__( self, server, groupid, schema, topics=None, extraconsumerconfig=None,consume_nmsgs=10, consume_timeout=5, nomsg_sleeptime=1, logger=_logger, username=None, password=None ): - """Wraps a confluent_kafka.Consumer. - - server : the bootstrap.servers value - groupid : the group.id value - schema : filename where the schema of messages to be consumed can be found - topics : topic name, or list of topic names, to subscribe to - extraconsumerconfig : (optional) additional consumer config (dict) - consume_nmsgs : number of messages to pull from the server at once (default 10) - consume_timeout : timeout after waiting on the server for this many seconds - nomsg_sleeptime : sleep for this many seconds after a consume_timeout before trying again - logger : a logging object - - """ - - self.consumer = None - self.logger = logger - self.tot_handled = 0 - - self.schema = fastavro.schema.load_schema( schema ) - self.consume_nmsgs = consume_nmsgs - self.consume_timeout = consume_timeout - self.nomsg_sleeptime = nomsg_sleeptime - self.username = username - self.password = password - - consumerconfig = { "bootstrap.servers": server, - "auto.offset.reset": "earliest", - "group.id": groupid, - } - - if extraconsumerconfig is not None: - consumerconfig.update( extraconsumerconfig ) - - self.logger.debug( f'Initializing Kafka consumer with\n{json.dumps(consumerconfig, indent=4)}' ) - self.consumer = confluent_kafka.Consumer( consumerconfig ) - atexit.register( close_msg_consumer, self ) - - self.subscribed = False - self.subscribe( topics ) - - def close( self ): - if self.consumer is not None: - self.logger.info( "Closing MsgConsumer" ) - self.consumer.close() - self.consumer = None - - def subscribe( self, topics ): - if topics is None: - self.topics = [] - elif isinstance( topics, str ): - self.topics = [ topics ] - elif isinstance( topics, collections.abc.Sequence ): - self.topics = list( topics ) - else: - raise ValueError( f'topics must be either a string or a list' ) - - servertopics = self.get_topics() - subtopics = [] - for topic in self.topics: - if topic not in servertopics: - self.logger.warning( f'Topic {topic} not on server, not subscribing' ) - else: - subtopics.append( topic ) - self.topics = subtopics - - #for topic in self.topics: - # st = [i for i in servertopics if topic in i] - # if len(st) !=0: - # for t in st: - # subtopics.append(t) - # else: - # self.logger.warning( f'Topic {topic} not on server, not subscribing' ) - #self.topics = subtopics - - if self.topics is not None and len(self.topics) > 0: - self.logger.info( f'Subscribing to topics: {", ".join( self.topics )}' ) - self.consumer.subscribe( self.topics, on_assign=self._sub_callback ) - else: - self.logger.warning( f'No existing topics given, not subscribing.' 
) - - def get_topics( self ): - cluster_meta = self.consumer.list_topics() - return [ n for n in cluster_meta.topics ] - - def print_topics( self, newlines=False ): - topics = self.get_topics() - if not newlines: - self.logger.info( f"\nTopics: {', '.join(topics)}" ) - else: - topicstr = '\n '.join( topics ) - self.logger.info( f"\nTopics:\n {topicstr}" ) - - def _get_positions( self, partitions ): - return self.consumer.position( partitions ) - - def _dump_assignments( self, ofp, partitions ): - ofp.write( f'{"Topic":<32s} {"partition":>9s} {"offset":>12s}\n' ) - for par in partitions: - ofp.write( f"{par.topic:32s} {par.partition:9d} {par.offset:12d}\n" ) - ofp.write( "\n" ) - - def print_assignments( self ): - asmgt = self._get_positions( self.consumer.assignment() ) - ofp = io.StringIO() - ofp.write( "Current partition assignments\n" ) - self._dump_assignments( ofp, asmgt ) - self.logger.info( ofp.getvalue() ) - ofp.close() - - def _sub_callback( self, consumer, partitions ): - self.subscribed = True - ofp = io.StringIO() - ofp.write( "Consumer subscribed. Assigned partitions:\n" ) - self._dump_assignments( ofp, self._get_positions( partitions ) ) - self.logger.info( ofp.getvalue() ) - ofp.close() - - def reset_to_start( self, topic ): - partitions = self.consumer.list_topics( topic ).topics[topic].partitions - self.logger.info( f'Resetting partitions for topic {topic}' ) - # partitions is a map - partlist = [] - # Must consume one message to really hook up to the topic - self.consume_one_message( handler=_donothing, timeout=10 ) - for i in range(len(partitions)): - self.logger.info( f'...resetting partition {i}' ) - curpart = confluent_kafka.TopicPartition( topic, i ) - lowmark, highmark = self.consumer.get_watermark_offsets( curpart ) - self.logger.debug( f'Partition {curpart.topic} has id {curpart.partition} ' - f'and current offset {curpart.offset}; lowmark={lowmark} ' - f'and highmark={highmark}' ) - curpart.offset = lowmark - if lowmark < highmark: - self.consumer.seek( curpart ) - partlist.append( curpart ) - self.logger.info( f'Committing partition offsets.' ) - self.consumer.commit( offsets=partlist ) - self.tot_handled = 0 - - def consume_one_message( self, timeout=None, handler=None ): - """Both calls handler and returns a batch of 1 message.""" - timeout = self.consume_timeout if timeout is None else timeout - self.logger.info( f"Trying to consume one message with timeout {timeout}...\n" ) - msgs = self.consumer.consume( 1, timeout=timeout ) - if len(msgs) == 0: - return None - else: - self.tot_handled += len(msgs) - if handler is not None: - handler( msgs ) - else: - self.default_handle_message_batch( msgs ) - - def default_handle_message_batch( self, msgs ): - self.logger.info( f'Got {len(msgs)}; have received {self._tot_handled} so far.' ) - - def echoing_handle_message_batch( self, msgs ): - self.logger.info( f'Handling {len(msgs)} messages' ) - for msg in msgs: - ofp = io.StringIO( f"Topic: {msg.topic()} ; Partition: {msg.partition()} ; " - f"Offset: {msg.offset()} ; Key: {msg.key()}\n" ) - alert = fastavro.schemaless_reader( io.BytesIO(msg.value()), self.schema ) - ofp.write( json.dumps( alert, indent=4, sort_keys=True ) ) - ofp.write( "\n" ) - self.logger.info( ofp.getvalue() ) - ofp.close() - self.logger.info( f'Have handled {self.tot_handled} messages so far' ) - - def poll_loop( self, handler=None, max_consumed=None, pipe=None, max_runtime=datetime.timedelta(hours=1) ): - """Calls handler with batches of messages. 
- - handler : a callback that's called with batches of messages (the list - returned by confluent_kafka.Consumer.consume(). - max_consumed : Quit polling after this many messages have been - consumed (default: no limit) - pipe : A pipe to send regular heartbeats to, and to listen for "die" messages from. - max_runtime : Quit polling after this much time has elapsed; - must be a datetime.timedelta object. (Default: 1h.) - - returns True if consumed ?max_consumed or timed out, False if died due to die command - """ - nconsumed = 0 - starttime = datetime.datetime.now() - keepgoing = True - retval = True - while keepgoing: - self.logger.debug( f"Trying to consume {self.consume_nmsgs} messages " - f"with timeout {self.consume_timeout}..." ) - msgs = self.consumer.consume( self.consume_nmsgs, timeout=self.consume_timeout ) - if len(msgs) == 0: - self.logger.debug( f"No messages, sleeping {self.nomsg_sleeptime} sec" ) - time.sleep( self.nomsg_sleeptime ) - else: - self.logger.debug( f"...got {len(msgs)} messages" ) - self.tot_handled += len(msgs) - if handler is not None: - handler( msgs ) - else: - self.default_handle_message_batch( msgs ) - nconsumed += len( msgs ) - runtime = datetime.datetime.now() - starttime - if ( ( ( max_consumed is not None ) and ( nconsumed >= max_consumed ) ) - or - ( ( max_runtime is not None ) and ( runtime > max_runtime ) ) ): - keepgoing = False - if pipe is not None: - pipe.send( { "message": "ok", "nconsumed": nconsumed, "runtime": runtime } ) - if pipe.poll(): - msg = pipe.recv() - if ( 'command' in msg ) and ( msg['command'] == 'die' ): - self.logger.info( "Exiting poll loop due to die command." ) - retval = False - keepgoing = False - else: - self.logger.error( f"Received unknown message from pipe, ignoring: {msg}" ) - - self.logger.info( f"Stopping poll loop after consuming {nconsumed} messages during {runtime}" ) - return retval - - -class BrokerConsumer: - def __init__( self, server, groupid, topics=None, updatetopics=False, - schemaless=True, reset=False, extraconfig={}, collection=None, - schemafile=None, pipe=None, loggername="BROKER", username=None, password=None, **kwargs ): - - self.logger = logging.getLogger( loggername ) - self.logger.propagate = False - logout = logging.FileHandler( _logdir / f"logs/broker.log" ) - self.logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - {loggername} - %(levelname)s] - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' ) - logout.setFormatter( formatter ) - self.logger.setLevel( logging.DEBUG ) - - if schemafile is None: - schemafile = _rundir / "elasticc.v0_9_1.brokerClassification.avsc" - - self.countlogger = logging.getLogger( f"countlogger_{loggername}" ) - self.countlogger.propagate = False - _countlogout = logging.FileHandler( _logdir / f"logs/brokerpoll_counts_{loggername}.log" ) - _countformatter = logging.Formatter( f'[%(asctime)s - %(levelname)s] - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' ) - _countlogout.setFormatter( _countformatter ) - self.countlogger.addHandler( _countlogout ) - self.countlogger.setLevel( logging.DEBUG ) - - self.countlogger.info( f"************ Starting Brokerconsumer for {loggername} ****************" ) - - self.pipe = pipe - self.server = server - self.groupid = groupid - self.topics = topics - self._updatetopics = updatetopics - self._reset = reset - self.extraconfig = extraconfig - self.username = username - self.password = password - - self.schemaless = schemaless - if not self.schemaless: - self.countlogger.error( "CRASHING. 
I only know how to handle schemaless streams." ) - raise RuntimeError( "I only know how to handle schemaless streams" ) - self.schemafile = schemafile - self.schema = fastavro.schema.load_schema( self.schemafile ) - - self.nmessagesconsumed = 0 - - self.countlogger.info( f"************ Connecting to MongoDB {loggername} ****************" ) - mongo_username = urllib.parse.quote_plus(os.environ['MONGODB_ALERT_WRITER']) - mongo_password = urllib.parse.quote_plus(os.environ['MONGODB_ALERT_WRITER_PASSWORD']) - - client = MongoClient("mongodb://%s:%s@fastdbdev-mongodb:27017/?authSource=alerts" %(mongo_username,mongo_password) ) - self.db = client.alerts - self.collection = self.db[collection] - self.countlogger.info(self.db) - self.countlogger.info(self.collection) - - @property - def reset( self ): - return self._reset - - @reset.setter - def reset( self, val ): - self._reset = val - - def create_connection( self ): - countdown = 5 - while countdown >= 0: - try: - self.consumer = MsgConsumer( self.server, self.groupid, self.schemafile, self.topics, extraconsumerconfig=self.extraconfig, consume_nmsgs=1000, consume_timeout=1, nomsg_sleeptime=5, logger=self.logger, username=self.username, password=self.password ) - countdown = -1 - except Exception as e: - countdown -= 1 - strio = io.StringIO("") - strio.write( f"Exception connecting to broker: {str(e)}" ) - traceback.print_exc( file=strio ) - self.logger.warning( strio.getvalue() ) - if countdown >= 0: - self.logger.warning( "Sleeping 5s and trying again." ) - time.sleep(5) - else: - self.logger.error( "Repeated exceptions connecting to broker, punting." ) - self.countlogger.error( "Repeated exceptions connecting to broker, punting." ) - raise RuntimeError( "Failed to connect to broker" ) - - if self._reset and ( self.topics is not None ): - self.countlogger.info( f"*************** Resetting to start of broker kafka stream ***************" ) - self.reset_to_start() - # Only want to reset the first time the connection is opened! - self._reset = False - - self.countlogger.info( f"**************** Consumer connection opened *****************" ) - - def close_connection( self ): - self.countlogger.info( f"**************** Closing consumer connection ******************" ) - self.consumer.close() - self.consumer = None - - - def reset_to_start( self ): - self.logger.info( "Resetting all topics to start" ) - for topic in self.topics: - self.consumer.reset_to_start( topic ) - - def handle_message_batch( self, msgs ): - messagebatch = [] - self.countlogger.info( f"Handling {len(msgs)} messages; consumer has received " - f"{self.consumer.tot_handled} messages." ) - for msg in msgs: - timestamptype, timestamp = msg.timestamp() - - - if timestamptype == confluent_kafka.TIMESTAMP_NOT_AVAILABLE: - timestamp = None - else: - timestamp = datetime.datetime.fromtimestamp( timestamp / 1000, tz=datetime.timezone.utc ) - - payload = msg.value() - if not self.schemaless: - self.countlogger.error( "I only know how to handle schemaless streams" ) - raise RuntimeError( "I only know how to handle schemaless streams" ) - alert = fastavro.schemaless_reader( io.BytesIO( payload ), self.schema ) - messagebatch.append( { 'topic': msg.topic(), - 'msgoffset': msg.offset(), - 'timestamp': timestamp, - 'msg': alert } ) - added = self.store( messages = messagebatch ) - #self.countlogger.info( f"...added {added['addedmsgs']} messages, " -# f"{added['addedclassifiers']} classifiers, " -# f"{added['addedclassifications']} classifications#. 
" ) - - def poll( self, restart_time=datetime.timedelta(minutes=30) ): - self.create_connection() - while True: - if self._updatetopics: - self.update_topics() - strio = io.StringIO("") - if len(self.consumer.topics) == 0: - self.logger.info( "No topics, will wait 10s and reconnect." ) - time.sleep(10) - else: - self.logger.info( f"Subscribed to topics: {self.consumer.topics}; starting poll loop." ) - self.countlogger.info( f"Subscribed to topics: {self.consumer.topics}; starting poll loop." ) - try: - happy = self.consumer.poll_loop( handler=self.handle_message_batch, - max_consumed=None, max_runtime=restart_time, - pipe=self.pipe ) - if happy: - strio.write( f"Reached poll timeout for {self.server}; " - f"handled {self.consumer.tot_handled} messages. " ) - else: - strio.write( f"Poll loop received die command after handling " - f"{self.consumer.tot_handled} messages. Exiting." ) - self.logger.info( strio.getvalue() ) - self.countlogger.info( strio.getvalue() ) - self.close_connection() - return - except Exception as e: - otherstrio = io.StringIO("") - traceback.print_exc( file=otherstrio ) - self.logger.warning( otherstrio.getvalue() ) - strio.write( f"Exception polling: {str(e)}. " ) - - if self.pipe.poll(): - msg = self.pipe.recv() - if ( 'command' in msg ) and ( msg['command'] == 'die' ): - self.logger.info( "No topics, but also exiting broker poll due to die command." ) - self.countlogger.info( "No topics, but also existing broker poll due to die command." ) - self.close_connection() - return - strio.write( "Reconnecting.\n" ) - self.logger.info( strio.getvalue() ) - self.countlogger.info( strio.getvalue() ) - self.close_connection() - if self._updatetopics: - self.topics = None - self.create_connection() - - - def store(self, messages = None): - - messagebatch = messages - results = self.collection.insert_many(messagebatch) - count = len(results.inserted_ids) - self.logger.info(f"Inserted {count} messages") - - return - - -class AlerceConsumer(BrokerConsumer): - def __init__( self, - grouptag=None, - usernamefile='/secrets/alerce_username', - passwdfile='/secrets/alerce_passwd', - loggername="ALERCE", - early_offset=os.getenv( "ALERCE_TOPIC_RELDATEOFFSET", -4 ), - alerce_topic_pattern='^lc_classifier_.*_(\d{4}\d{2}\d{2})$', - **kwargs ): - server = os.getenv( "ALERCE_KAFKA_SERVER", "kafka.alerce.science:9093" ) - groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) - self.early_offset = int( early_offset ) - self.alerce_topic_pattern = alerce_topic_pattern - topics = None - updatetopics = True - with open( usernamefile ) as ifp: - username = ifp.readline().strip() - with open( passwdfile ) as ifp: - passwd = ifp.readline().strip() - extraconfig = { "security.protocol": "SASL_SSL", - "sasl.mechanism": "SCRAM-SHA-512", - "sasl.username": username, - "sasl.password": passwd } - - collection = 'alerce' - - super().__init__( server, groupid, topics=topics, updatetopics=updatetopics, extraconfig=extraconfig, colection=collection, loggername=loggername, username=username, password=password, **kwargs ) - self.logger.info( f"ALERCE group id is {groupid}" ) - - self.badtopics = [ 'lc_classifier_balto_20230807' ] - - def update_topics( self, *args, **kwargs ): - now = datetime.datetime.now() - datestrs = [] - for ddays in range(self.early_offset, 3): - then = now + datetime.timedelta( days=ddays ) - datestrs.append( f"{then.year:04d}{then.month:02d}{then.day:02d}" ) - tosub = [] - topics = self.consumer.get_topics() - for topic in topics: - match = re.search( 
'^lc_classifier_.*_(\d{4}\d{2}\d{2})$', topic ) - if match and ( match.group(1) in datestrs ) and ( topic not in self.badtopics ): - tosub.append( topic ) - self.topics = tosub - self.consumer.subscribe( self.topics ) - -class TestConsumer(BrokerConsumer): - def __init__( self, grouptag=None, test_topic=None, loggername="TEST", **kwargs ): - - collection = 'test' - - if test_topic is None: - raise RuntimeError( "Must specify test topic" ) - server = "kafka-server:9092" - groupid = "testing" + ("" if grouptag is None else "-" + grouptag ) - topics = [ test_topic ] - super().__init__( server, groupid, topics=topics, loggername=loggername, collection=collection, - **kwargs ) - self.logger.info( f"Test group id is {groupid}, topic is {test_topic}" ) - -class Broker(object): - - def __init__( self, username=None, password=None, *args, **options ): - - self.logger = logging.getLogger( "brokerpoll_baselogger" ) - self.logger.propagate = False - logout = logging.FileHandler( _logdir / f"logs/brokerpoll.log" ) - self.logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - brokerpoll - %(levelname)s] - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' ) - logout.setFormatter( formatter ) - self.logger.setLevel( logging.DEBUG ) - if options['reset']: - self.reset = options['reset'] - - self.username = username - self.password = password - - - def sigterm( self, sig="TERM" ): - self.logger.warning( f"Got a {sig} signal, trying to die." ) - self.mustdie = True - - def launch_broker( self, brokerclass, pipe, **options ): - signal.signal( signal.SIGINT, - lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGINT" ) ) - signal.signal( signal.SIGTERM, - lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGTERM" ) ) - consumer = brokerclass( pipe=pipe, **options ) - consumer.poll() - - def broker_poll( self, *args, **options): - self.logger.info( "******** brokerpoll starting ***********" ) - - self.mustdie = False - signal.signal( signal.SIGTERM, lambda sig, stack: self.sigterm( "TERM" ) ) - signal.signal( signal.SIGINT, lambda sig, stack: self.sigterm( "INT" ) ) - - - brokerstodo = {} - if options['do_alerce']: - brokerstodo['alerce'] = AlerceConsumer - if options['do_antares']: - brokerstodo['antares'] = AntaresConsumer - if options['do_fink']: - brokerstodo['fink'] = FinkConsumer - if options['do_pitt']: - brokerstodo['pitt'] = PittGoogleBroker - if options['do_brahms']: - brokerstodo['brahms'] = BrahmsConsumer - if options['do_test']: - brokerstodo['test'] = TestConsumer - if len( brokerstodo ) == 0: - print( "Must give at least one broker to listen to." ) - - brokers = {} - - # Launch a process for each broker that will poll that broker indefinitely - - for name,brokerclass in brokerstodo.items(): - self.logger.info( f"Launching thread for {name}" ) - parentconn, childconn = multiprocessing.Pipe() - proc = multiprocessing.Process( target=self.launch_broker(brokerclass, childconn, **options) ) - proc.start() - brokers[name] = { "process": proc, - "pipe": parentconn, - "lastheartbeat": time.monotonic() } - - # Listen for a heartbeat from all processes. - # If we don't get a heartbeat for 5min, - # kill that process and restart it. 
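The heartbeat exchange described in the comment above is a small multiprocessing.Pipe protocol: each broker subprocess periodically sends an "ok" message up its end of the pipe, and the parent kills and relaunches any child that stays silent too long. A minimal standalone sketch of that pattern (names here are illustrative, not taken from this code):

import multiprocessing
import time

def child( conn ):
    # Stand-in for a broker poll loop: do a little work, then report in.
    for _ in range( 3 ):
        time.sleep( 1 )
        conn.send( { "message": "ok", "nconsumed": 0 } )
        if conn.poll() and conn.recv().get( "command" ) == "die":
            return

parentconn, childconn = multiprocessing.Pipe()
proc = multiprocessing.Process( target=child, args=( childconn, ) )
proc.start()
lastheartbeat = time.monotonic()
while proc.is_alive():
    if parentconn.poll( 2 ):
        msg = parentconn.recv()
        if msg.get( "message" ) == "ok":
            lastheartbeat = time.monotonic()
    if time.monotonic() - lastheartbeat > 300:
        proc.kill()   # silent too long; the real loop would relaunch it here
        break
proc.join()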
- - heartbeatwait = 2 - toolongsilent = 300 - while not self.mustdie: - try: - pipelist = [ b['pipe'] for i,b in brokers.items() ] - whichpipe = multiprocessing.connection.wait( pipelist, timeout=heartbeatwait ) - - brokerstorestart = set() - for name, broker in brokers.items(): - try: - while broker['pipe'].poll(): - msg = broker['pipe'].recv() - if ( 'message' not in msg ) or ( msg['message'] != "ok" ): - self.logger.error( f"Got unexpected message from thread for {name}; " - f"will restart: {msg}" ) - brokerstorestart.add( name ) - else: - self.logger.debug( f"Got heartbeat from {name}" ) - broker['lastheartbeat'] = time.monotonic() - except Exception as ex: - self.logger.error( f"Got exception listening for heartbeat from {name}; will restart." ) - brokerstorestart.add( name ) - - for name, broker in brokers.items(): - dt = time.monotonic() - broker['lastheartbeat'] - if dt > toolongsilent: - self.logger.error( f"It's been {dt:.0f} seconds since last heartbeat from {name}; "f"will restart." ) - brokerstorestart.add( name ) - - for torestart in brokerstorestart: - self.logger.warning( f"Killing and restarting process for {torestart}" ) - brokers[torestart]['process'].kill() - brokers[torestart]['pipe'].close() - del brokers[torestart] - parentconn, childconn = multiprocessing.Pipe() - proc = multiprocessing.Process( target=lambda: self.launch_broker( brokerstodo[torestart], - childconn, **options ) ) - proc.start() - brokers[torestart] = { "process": proc, - "pipe": parentconn, - "lastheartbeat": time.monotonic() } - except Exception as ex: - self.logger.exception( "brokerpoll got an exception, going to shut down." ) - self.mustdie = True - - # I chose 20s since kubernetes sends a TERM and then waits 30s before shutting things down - self.logger.warning( "Shutting down. Sending die to all processes and waiting 20s" ) - for name, broker in brokers.items(): - broker['pipe'].send( { "command": "die" } ) - time.sleep( 20 ) - self.logger.warning( "Exiting." 
) - return - - -if __name__ == '__main__': - - logger = logging.getLogger( "brokerpoll_baselogger" ) - logger.propagate = False - logout = logging.FileHandler( _logdir / f"logs/brokerpoll.log" ) - logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - brokerpoll - %(levelname)s] - %(message)s',datefmt='%Y-%m-%d %H:%M:%S' ) - logout.setFormatter( formatter ) - logger.setLevel( logging.DEBUG ) - - - parser = argparse.ArgumentParser() - - parser.add_argument( '--do-alerce', action='store_true', default=False, help="Poll from ALeRCE" ) - parser.add_argument( '--alerce-topic-pattern', default='^lc_classifier_.*_(\d{4}\d{2}\d{2})$', - help='Regex for matching ALeRCE topics (warning: custom code, see AlerceBroker)' ) - parser.add_argument( '--do-antares', action='store_true', default=False, help="Poll from ANTARES" ) - parser.add_argument( '--antares-topic', default=None, help='Topic name for Antares' ) - parser.add_argument( '--do-fink', action='store_true', default=False, help="Poll from Fink" ) - parser.add_argument( '--fink-topic', default=None, help='Topic name for Fink' ) - parser.add_argument( '--do-brahms', action='store_true', default=False, - help="Poll from Rob's test kafka server" ) - parser.add_argument( '--brahms-topic', default=None, - help="Topic to poll on brahms (required if --do-brahms is True)" ) - parser.add_argument( '--do-pitt', action='store_true', default=False, help="Poll from PITT-Google" ) - parser.add_argument( '--pitt-topic', default=None, help="Topic name for PITT-Google" ) - parser.add_argument( '--pitt-project', default=None, help="Project name for PITT-Google" ) - parser.add_argument( '--do-test', action='store_true', default=False, - help="Poll from kafka-server:9092 (for testing purposes)" ) - parser.add_argument( '--test-topic', default='classifications', - help="Topic to poll from on kafka-server:9092" ) - parser.add_argument( '-g', '--grouptag', default=None, help="Tag to add to end of kafka group ids" ) - parser.add_argument('-r', '--reset', action='store_true', default=False, help='Reset all stream pointers') - - options = vars(parser.parse_args()) - - broker = Broker(**options) - - poll = broker.broker_poll(**options) diff --git a/logs/.placeholder b/logs/.placeholder new file mode 100644 index 00000000..20f00c0e --- /dev/null +++ b/logs/.placeholder @@ -0,0 +1 @@ +This is here so that the logs directory will be created on git checkout. 
\ No newline at end of file diff --git a/tests/docker-compose.yaml b/tests/docker-compose.yaml index 1fb537bb..dcc9a977 100644 --- a/tests/docker-compose.yaml +++ b/tests/docker-compose.yaml @@ -44,22 +44,29 @@ services: timeout: 10s retries: 5 - # cassandra: - # image: registry.nersc.gov/m1727/raknop/tom_cassandra - # build: - # context: ../docker_cassandra - # healthcheck: - # test: cqlsh -u cassandra -p cassandra cassandra < /dev/null - # interval: 10s - # timeout: 2s - # retries: 10 + mongodb: + image: registry.nersc.gov/m1727/rknop/tom-mongodb:latest + build: + context: ../docker_mongodb + environment: + MONGO_INITDB_ROOT_USERNAME: mongodb_admin + MONGO_INITDB_ROOT_PASSWORD: fragile + MONGO_ALERT_WRITER_USERNAME: mongodb_alert_writer + MONGO_ALERT_WRITER_PASSWORD: writer + MONGO_ALERT_READER_USERNAME: mongodb_alert_reader + MONGO_ALERT_READER_PASSWORD: reader + healthcheck: + test: netcat -w 1 localhost 27017 || exit 1 + interval: 5s + timeout: 10s + retries: 5 createdb: depends_on: postgres: condition: service_healthy - # cassandra: - # condition: service_healthy + mongodb: + condition: service_healthy image: registry.nersc.gov/m1727/raknop/tom_desc_bindmount build: context: ../ @@ -75,7 +82,11 @@ services: - type: bind source: ../secrets target: /secrets + - type: volume + source: logs + target: /logs environment: + LOGDIR: /logs DB_NAME: tom_desc DB_HOST: postgres DB_USER: postgres @@ -89,7 +100,7 @@ services: python tom_desc/manage.py create_test_superuser python tom_desc/manage.py create_test_apibroker python /tests/create_postgres_ro_user.py -# python tom_desc/manage.py sync_cassandra + tom: depends_on: @@ -110,17 +121,29 @@ services: - type: volume source: query_results target: /query_results + - type: volume + source: logs + target: /logs environment: + LOGDIR: /logs DB_NAME: tom_desc DB_HOST: postgres DB_USER: postgres DB_PASS: fragile DB_PORT: 5432 + MONGOHOST: mongodb + MONGODB_ADMIN: mongodb_admin + MONGODB_ADMIN_PASSWORD: fragile + MONGODB_ALERT_WRITER: mongodb_alert_writer + MONGODB_ALERT_WRITER_PASSWORD: writer + MONGODB_ALERT_READER: mongdb_alert_reader + MONGODB_ALERT_READER_PASSWORD: reader + brokerpoll: depends_on: - postgres: - condition: service_healthy + createdb: + condition: service_completed_successfully tom: condition: service_started fakebroker: @@ -137,7 +160,11 @@ services: - type: bind source: ../tom_desc target: /tom_desc + - type: volume + source: logs + target: /logs environment: + LOGDIR: /logs DB_NAME: tom_desc DB_HOST: postgres DB_USER: postgres @@ -145,16 +172,23 @@ services: DB_PORT: 5432 entrypoint: [ "python", "manage.py", "brokerpoll2", "--do-test" ] + + # Thought required: want to make this dependent on + # createdb completed successfully, or just on the + # database servers being up? The advantage of the latter + # is that you can get shell started even if there are + # database migration errors, and use shell to diagnose/fix + # them. 
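The MONGODB_* variables added to the environment blocks above have to match the credentials the mongodb service is initialized with. For reference, a rough sketch of how the Python side picks them up, mirroring the pymongo usage that appears later in _brokerconsumer.py (the 'alerts' auth database is simply the value used elsewhere in this patch):

import os
import urllib.parse
from pymongo import MongoClient

# Values come from the docker-compose environment blocks above.
host = os.getenv( 'MONGOHOST', 'mongodb' )
user = urllib.parse.quote_plus( os.environ['MONGODB_ALERT_WRITER'] )
password = urllib.parse.quote_plus( os.environ['MONGODB_ALERT_WRITER_PASSWORD'] )
client = MongoClient( f"mongodb://{user}:{password}@{host}:27017/?authSource=alerts" )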
shell: depends_on: postgres: condition: service_healthy - # cassandra: - # condition: service_healthy + mongodb: + condition: service_healthy tom: condition: service_started fakebroker: - condition: service_started + condition: service_started brokerpoll: condition: service_started image: registry.nersc.gov/m1727/raknop/tom_server_bindmount_dev @@ -178,55 +212,25 @@ services: - type: volume source: query_results target: /query_results + - type: volume + source: logs + target: /logs environment: + LOGDIR: /logs DB_NAME: tom_desc DB_HOST: postgres DB_USER: postgres DB_PASS: fragile DB_PORT: 5432 + MONGOHOST: mongodb + MONGODB_ADMIN: mongodb_admin + MONGODB_ADMIN_PASSWORD: fragile + MONGODB_ALERT_WRITER: mongodb_alert_writer + MONGODB_ALERT_WRITER_PASSWORD: writer + MONGODB_ALERT_READER: mongdb_alert_reader + MONGODB_ALERT_READER_PASSWORD: reader entrypoint: [ "tail", "-f", "/etc/issue" ] - # This won't work right now, there's scary things - # about test data and such - # runtests: - # depends_on: - # tom: - # condition: service_started - # kafka-server: - # condition: service_healthy - # fakebroker: - # condition: service_started - # brokerpoll: - # condition: service_started - # image: registry.nersc.gov/m1727/raknop/tom_server_bindmount_dev - # build: - # context: ../ - # dockerfile: docker_server/Dockerfile - # target: tom-server-bindmount-dev - # volumes: - # - type: bind - # source: ../secrets - # target: /secrets - # - type: bind - # source: ../tom_desc - # target: /tom_desc - # - type: bind - # source: . - # target: /tests - # - type: bind - # source: ${ELASTICC2_TEST_DATA:-/dev/null} - # target: /elasticc2data - # - type: volume - # source: query_results - # target: /query_results - # working_dir: /tests - # environment: - # DB_NAME: tom_desc - # DB_HOST: postgres - # DB_USER: postgres - # DB_PASS: fragile - # DB_PORT: 5432 - # entrypoint: [ "pytest", "-v" ] - volumes: query_results: + logs: diff --git a/tom_desc/db/management/commands/_brokerconsumer.py b/tom_desc/db/management/commands/_brokerconsumer.py new file mode 100644 index 00000000..74f31a2a --- /dev/null +++ b/tom_desc/db/management/commands/_brokerconsumer.py @@ -0,0 +1,488 @@ +import sys +import os +import io +import time +import datetime +import traceback +import pathlib +import urllib +import logging + +import confluent_kafka +import fastavro +from pymongo import MongoClient + +# TODO : uncomment this next line +# and the whole PittGoogleBroker class +# when pittgoogle works again +# import pittgoogle + +_rundir = pathlib.Path(__file__).parent +_djangodir = _rundir.parent.parent.parent +_logdir = pathlib.Path( os.getenv( 'LOGDIR', '/logs' ) ) + +# sys.path.insert( 0, str(_rundir) ) +# Add the db/management/commands directory as we include stuff from there +sys.path.append( str( _djangodir / "db/management/commands" ) ) +from _consumekafkamsgs import MsgConsumer + + +class BrokerConsumer: + """A class for consuming broker messages from brokers. + + Currently supports only kafka brokers, though there is some + (currently broken and commented out) code for pulling from the + pubsub Pitt-Google broker. + + Currently may assume that broker messages are coming in the elasticc2 v0.91 schema. 
+ + """ + + def __init__( self, server, groupid, topics=None, updatetopics=False, + schemaless=True, reset=False, extraconfig={}, + schemafile=None, pipe=None, loggername="BROKER", + postgres_brokermessage_model=None, mongodb_dbname=None, mongodb_collection=None, + **kwargs ): + + self.logger = logging.getLogger( loggername ) + self.logger.propagate = False + logout = logging.StreamHandler( sys.stderr ) + self.logger.addHandler( logout ) + formatter = logging.Formatter( f'[%(asctime)s - {loggername} - %(levelname)s] - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' ) + logout.setFormatter( formatter ) + # self.logger.setLevel( logging.INFO ) + self.logger.setLevel( logging.DEBUG ) + + self.countlogger = logging.getLogger( f"countlogger_{loggername}" ) + self.countlogger.propagate = False + _countlogout = logging.FileHandler( _logdir / f"brokerpoll_counts_{loggername}.log" ) + _countformatter = logging.Formatter( f'[%(asctime)s - %(levelname)s] - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' ) + _countlogout.setFormatter( _countformatter ) + self.countlogger.addHandler( _countlogout ) + self.countlogger.setLevel( logging.INFO ) + # self.countlogger.setLevel( logging.DEBUG ) + + if schemafile is None: + schemafile = _djangodir / "elasticc2/management/commands/elasticc.v0_9_1.brokerClassification.avsc" + + self.countlogger.info( f"************ Starting Brokerconsumer for {loggername} ****************" ) + + self.pipe = pipe + self.server = server + self.groupid = groupid + self.topics = topics + self._updatetopics = updatetopics + self._reset = reset + self.extraconfig = extraconfig + + self.schemaless = schemaless + if not self.schemaless: + self.countlogger.error( "CRASHING. I only know how to handle schemaless streams." ) + raise RuntimeError( "I only know how to handle schemaless streams" ) + self.schemafile = schemafile + self.schema = fastavro.schema.load_schema( self.schemafile ) + + self.nmessagesconsumed = 0 + + # Figure out where we're saving stuff. postgres_brokermessage_model right + # now only works with elasticc2.models.BrokerMessage. Before trying it + # with anything else, make sure the interface works right and that it + # understands the alert schema. + # mongodb_dbname is the name of the Mongo database on the mongodb running on $MONGOHOST + # + # This class supports saving to *both*, but usually you will probably only do + # one or the other. 
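As a usage illustration only: a postgres-backed poller and a mongo-backed poller construct the consumer with different keyword arguments. The model class below is the one brokerpoll2.py actually passes; the mongo database and collection names are placeholders.

# Postgres-backed storage (what elasticc2's brokerpoll2 command does):
#   consumer = TestConsumer( test_topic='classifications', pipe=pipe,
#                            postgres_brokermessage_model=elasticc2.models.BrokerMessage )
#
# Mongo-backed storage (database/collection names here are hypothetical):
#   consumer = TestConsumer( test_topic='classifications', pipe=pipe,
#                            mongodb_dbname='alerts', mongodb_collection='test' )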
+ + self.postgres_brokermessage_model = postgres_brokermessage_model + self.mongodb_dbname = mongodb_dbname + self.mongodb_collection = mongodb_collection + if ( self.mongodb_dbname is None ) != ( self.mongodb_collection is None ): + raise ValueError( "Must give either both or neither of mongodb_name and mongodb_collection" ) + + if self.postgres_brokermessage_model is not None: + self.logger.info( f"Writing broker messages to postgres model " + f"{self.postgres_brokermessage_model.__name__}" ) + if self.mongodb_dbname is not None: + # mongodb running on port 27017 on host $MONGOHOST; default + # $MONGOHOST to fastdbdev-mongodb for backwards compatibility + # with previous installs + self.mongohost = os.getenv( 'MONGOHOST', 'fastdbdev-mongodb' ) + self.mongousername = urllib.parse.quote_plus(os.environ['MONGODB_ALERT_WRITER']) + self.mongopassword = urllib.parse.quote_plus(os.environ['MONGODB_ALERT_WRITER_PASSWORD']) + self.logger.info( f"Writing broker messages to monogdb {self.mongodb_dbname} " + f"collection {self.mongodb_collection}" ) + # Thought required: it would be less overhead to connect to the mongo hosdt + # once here and just reuse that connection. However, if the mongodb restarts + # or the connection becomes invalid for any reason, we might regret holding + # the connection open for hours. On the assumption that the mongo connection + # overhead is going to be small compared to the time it takes to receive + # a batch of messages, just connect to the mongodb every time we handle + # a message batch. TODO: test this to see if that assumption is correct. + + + @property + def reset( self ): + return self._reset + + @reset.setter + def reset( self, val ): + self._reset = val + + def create_connection( self ): + countdown = 5 + while countdown >= 0: + try: + self.consumer = MsgConsumer( self.server, self.groupid, self.schemafile, self.topics, + extraconsumerconfig=self.extraconfig, + consume_nmsgs=1000, consume_timeout=1, nomsg_sleeptime=5, + logger=self.logger ) + countdown = -1 + except Exception as e: + countdown -= 1 + strio = io.StringIO("") + strio.write( f"Exception connecting to broker: {str(e)}" ) + traceback.print_exc( file=strio ) + self.logger.warning( strio.getvalue() ) + if countdown >= 0: + self.logger.warning( "Sleeping 5s and trying again." ) + time.sleep(5) + else: + self.logger.error( "Repeated exceptions connecting to broker, punting." ) + self.countlogger.error( "Repeated exceptions connecting to broker, punting." ) + raise RuntimeError( "Failed to connect to broker" ) + + if self._reset and ( self.topics is not None ): + self.countlogger.info( f"*************** Resetting to start of broker kafka stream ***************" ) + self.reset_to_start() + # Only want to reset the first time the connection is opened! + self._reset = False + + self.countlogger.info( f"**************** Consumer connection opened *****************" ) + + def close_connection( self ): + self.countlogger.info( f"**************** Closing consumer connection ******************" ) + self.consumer.close() + self.consumer = None + + def update_topics( self, *args, **kwargs ): + self.countlogger.info( "Subclass must implement this if you use it." ) + raise NotImplementedError( "Subclass must implement this if you use it." 
) + + def reset_to_start( self ): + self.logger.info( "Resetting all topics to start" ) + for topic in self.topics: + self.consumer.reset_to_start( topic ) + + def handle_message_batch( self, msgs ): + messagebatch = [] + self.countlogger.info( f"Handling {len(msgs)} messages; consumer has received " + f"{self.consumer.tot_handled} messages." ) + for msg in msgs: + timestamptype, timestamp = msg.timestamp() + + + if timestamptype == confluent_kafka.TIMESTAMP_NOT_AVAILABLE: + timestamp = None + else: + timestamp = datetime.datetime.fromtimestamp( timestamp / 1000, tz=datetime.timezone.utc ) + + payload = msg.value() + if not self.schemaless: + self.countlogger.error( "I only know how to handle schemaless streams" ) + raise RuntimeError( "I only know how to handle schemaless streams" ) + alert = fastavro.schemaless_reader( io.BytesIO( payload ), self.schema ) + messagebatch.append( { 'topic': msg.topic(), + 'msgoffset': msg.offset(), + 'timestamp': timestamp, + 'msg': alert } ) + if self.postgres_brokermessage_model is not None: + added = self.postgres_brokermessage_model.load_batch( messagebatch, logger=self.logger ) + self.countlogger.info( f"...added {added['addedmsgs']} messages, " + f"{added['addedclassifiers']} classifiers, " + f"{added['addedclassifications']} classifications. " ) + if self.mongodb_dbname is not None: + nadded = self.mongodb_store( messagebatch ) + self.countlogger.info( f"...added {nadded} messages to mongodb {self.mongodb_dbname} " + f"collection {self.mongodb_collection}" ) + + + def mongodb_store(self, messagebatch=None): + if messagebatch is None: + return 0 + client = MongoClient( f"mongodb://{self.mongousername}:{self.mongopassword}@{self.mongohost}:27017/" + f"?authSource={self.mongodb_dbname}" ) + db = getattr( client, self.mongodb_dbname ) + collection = db[ self.mongodb_collection ] + results = collection.insert_many(messagebatch) + return len(results.inserted_ids) + + + def poll( self, restart_time=datetime.timedelta(minutes=30) ): + self.create_connection() + while True: + if self._updatetopics: + self.update_topics() + strio = io.StringIO("") + if len(self.consumer.topics) == 0: + self.logger.info( "No topics, will wait 10s and reconnect." ) + time.sleep(10) + else: + self.logger.info( f"Subscribed to topics: {self.consumer.topics}; starting poll loop." ) + self.countlogger.info( f"Subscribed to topics: {self.consumer.topics}; starting poll loop." ) + try: + happy = self.consumer.poll_loop( handler=self.handle_message_batch, + max_consumed=None, max_runtime=restart_time, + pipe=self.pipe ) + if happy: + strio.write( f"Reached poll timeout for {self.server}; " + f"handled {self.consumer.tot_handled} messages. " ) + else: + strio.write( f"Poll loop received die command after handling " + f"{self.consumer.tot_handled} messages. Exiting." ) + self.logger.info( strio.getvalue() ) + self.countlogger.info( strio.getvalue() ) + self.close_connection() + return + except Exception as e: + otherstrio = io.StringIO("") + traceback.print_exc( file=otherstrio ) + self.logger.warning( otherstrio.getvalue() ) + strio.write( f"Exception polling: {str(e)}. " ) + + if self.pipe.poll(): + msg = self.pipe.recv() + if ( 'command' in msg ) and ( msg['command'] == 'die' ): + self.logger.info( "No topics, but also exiting broker poll due to die command." ) + self.countlogger.info( "No topics, but also existing broker poll due to die command." 
) + self.close_connection() + return + strio.write( "Reconnecting.\n" ) + self.logger.info( strio.getvalue() ) + self.countlogger.info( strio.getvalue() ) + self.close_connection() + if self._updatetopics: + self.topics = None + self.create_connection() + +# ====================================================================== +# I should replace this and the next one with a generic noauth consumer + +class BrahmsConsumer(BrokerConsumer): + def __init__( self, grouptag=None, brahms_topic=None, loggername="BRAHMS", **kwargs ): + if brahms_topic is None: + raise RuntimeError( "Must specify brahms topic" ) + server = "brahms.lbl.gov:9092" + groupid = "elasticc-lbnl" + ("" if grouptag is None else "-" + grouptag ) + topics = [ brahms_topic ] + super().__init__( server, groupid, topics=topics, loggername=loggername, **kwargs ) + self.logger.info( f"Brahms group id is {groupid}, topic is {brahms_topic}" ) + +# ====================================================================== +# This consumer is used in the tests + +class TestConsumer(BrokerConsumer): + def __init__( self, grouptag=None, test_topic=None, loggername="TEST", **kwargs ): + if test_topic is None: + raise RuntimeError( "Must specify test topic" ) + server = "kafka-server:9092" + groupid = "testing" + ("" if grouptag is None else "-" + grouptag ) + topics = [ test_topic ] + super().__init__( server, groupid, topics=topics, loggername=loggername, **kwargs ) + self.logger.info( f"Test group id is {groupid}, topic is {test_topic}" ) + + +# ====================================================================== + +class AntaresConsumer(BrokerConsumer): + def __init__( self, grouptag=None, + usernamefile='/secrets/antares_username', passwdfile='/secrets/antares_passwd', + loggername="ANTARES", antares_topic='elasticc2-st1-ddf-full', **kwargs ): + server = "kafka.antares.noirlab.edu:9092" + groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) + topics = [ antares_topic ] + updatetopics = False + with open( usernamefile ) as ifp: + username = ifp.readline().strip() + with open( passwdfile ) as ifp: + passwd = ifp.readline().strip() + extraconfig = { + "api.version.request": True, + "broker.version.fallback": "0.10.0.0", + "api.version.fallback.ms": "0", + "enable.auto.commit": True, + "security.protocol": "SASL_SSL", + "sasl.mechanism": "PLAIN", + "sasl.username": username, + "sasl.password": passwd, + "ssl.ca.location": str( _rundir / "antares-ca.pem" ), + "auto.offset.reset": "earliest", + } + super().__init__( server, groupid, topics=topics, updatetopics=updatetopics, + extraconfig=extraconfig, loggername=loggername, **kwargs ) + self.logger.info( f"Antares group id is {groupid}" ) + + +# ====================================================================== + +class FinkConsumer(BrokerConsumer): + def __init__( self, grouptag=None, loggername="FINK", fink_topic='fink_elasticc-2022fall', **kwargs ): + server = "134.158.74.95:24499" + groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) + topics = [ fink_topic ] + updatetopics = False + super().__init__( server, groupid, topics=topics, updatetopics=updatetopics, + loggername=loggername, **kwargs ) + self.logger.info( f"Fink group id is {groupid}" ) + + +# ====================================================================== + +class AlerceConsumer(BrokerConsumer): + def __init__( self, + grouptag=None, + usernamefile='/secrets/alerce_username', + passwdfile='/secrets/alerce_passwd', + loggername="ALERCE", + early_offset=os.getenv( 
"ALERCE_TOPIC_RELDATEOFFSET", -4 ), + alerce_topic_pattern='^lc_classifier_.*_(\d{4}\d{2}\d{2})$', + **kwargs ): + server = os.getenv( "ALERCE_KAFKA_SERVER", "kafka.alerce.science:9093" ) + groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) + self.early_offset = int( early_offset ) + self.alerce_topic_pattern = alerce_topic_pattern + topics = None + updatetopics = True + with open( usernamefile ) as ifp: + username = ifp.readline().strip() + with open( passwdfile ) as ifp: + passwd = ifp.readline().strip() + extraconfig = { "security.protocol": "SASL_SSL", + "sasl.mechanism": "SCRAM-SHA-512", + "sasl.username": username, + "sasl.password": passwd } + super().__init__( server, groupid, topics=topics, updatetopics=updatetopics, extraconfig=extraconfig, + loggername=loggername, **kwargs ) + self.logger.info( f"Alerce group id is {groupid}" ) + + self.badtopics = [ 'lc_classifier_balto_20230807' ] + + def update_topics( self, *args, **kwargs ): + now = datetime.datetime.now() + datestrs = [] + for ddays in range(self.early_offset, 3): + then = now + datetime.timedelta( days=ddays ) + datestrs.append( f"{then.year:04d}{then.month:02d}{then.day:02d}" ) + tosub = [] + topics = self.consumer.get_topics() + for topic in topics: + match = re.search( self.alerce_topic_pattern, topic ) + if match and ( match.group(1) in datestrs ) and ( topic not in self.badtopics ): + tosub.append( topic ) + self.topics = tosub + self.consumer.subscribe( self.topics ) + +# ===================================================================== + +# class PittGoogleBroker(BrokerConsumer): +# def __init__( +# self, +# pitt_topic: str, +# pitt_project: str, +# max_workers: int = 8, # max number of ThreadPoolExecutor workers +# batch_maxn: int = 1000, # max number of messages in a batch +# batch_maxwait: int = 5, # max seconds to wait between messages before processing a batch +# loggername: str = "PITTGOOGLE", +# **kwargs +# ): +# super().__init__(server=None, groupid=None, loggername=loggername, **kwargs) + +# topic = pittgoogle.pubsub.Topic(pitt_topic, pitt_project) +# subscription = pittgoogle.pubsub.Subscription(name=f"{pitt_topic}-desc", topic=topic) +# # if the subscription doesn't already exist, this will create one in the +# # project given by the env var GOOGLE_CLOUD_PROJECT +# subscription.touch() + +# self.consumer = pittgoogle.pubsub.Consumer( +# subscription=subscription, +# msg_callback=self.handle_message, +# batch_callback=self.handle_message_batch, +# batch_maxn=batch_maxn, +# batch_maxwait=batch_maxwait, +# executor=ThreadPoolExecutor( +# max_workers=max_workers, +# initializer=self.worker_init, +# initargs=( +# self.schema, +# subscription.topic.name, +# self.logger, +# self.countlogger +# ), +# ), +# ) + +# @staticmethod +# def worker_init(classification_schema: dict, pubsub_topic: str, +# broker_logger: logging.Logger, broker_countlogger: logging.Logger ): +# + + """Initializer for the ThreadPoolExecutor.""" +# global countlogger +# global logger +# global schema +# global topic + +# countlogger = broker_countlogger +# logger = broker_logger +# schema = classification_schema +# topic = pubsub_topic + +# logger.info( "In worker_init" ) + +# @staticmethod +# def handle_message(alert: pittgoogle.pubsub.Alert) -> pittgoogle.pubsub.Response: +# """Callback that will process a single message. 
This will run in a background thread.""" +# global logger +# global schema +# global topic + +# logger.info( "In handle_message" ) + +# message = { +# "msg": fastavro.schemaless_reader(io.BytesIO(alert.bytes), schema), +# "topic": topic, +# # this is a DatetimeWithNanoseconds, a subclass of datetime.datetime +# # https://googleapis.dev/python/google-api-core/latest/helpers.html +# "timestamp": alert.metadata["publish_time"].astimezone(datetime.timezone.utc), +# # there is no offset in pubsub +# # if this cannot be null, perhaps the message id would work? +# "msgoffset": alert.metadata["message_id"], +# } + +# return pittgoogle.pubsub.Response(result=message, ack=True) + +# @staticmethod +# def handle_message_batch(messagebatch: list) -> None: +# """Callback that will process a batch of messages. This will run in the main thread.""" +# global logger +# global countlogger + +# logger.info( "In handle_message_batch" ) +# # import pdb; pdb.set_trace() + +# added = BrokerMessage.load_batch(messagebatch, logger=logger) +# countlogger.info( +# f"...added {added['addedmsgs']} messages, " +# f"{added['addedclassifiers']} classifiers, " +# f"{added['addedclassifications']} classifications. " +# ) + +# def poll(self): +# # this blocks indefinitely or until a fatal error +# # use Control-C to exit +# self.consumer.stream( pipe=self.pipe, heartbeat=60 ) + + + + diff --git a/tom_desc/elasticc2/management/commands/brokerpoll2.py b/tom_desc/elasticc2/management/commands/brokerpoll2.py index 41231878..ca4fb68a 100644 --- a/tom_desc/elasticc2/management/commands/brokerpoll2.py +++ b/tom_desc/elasticc2/management/commands/brokerpoll2.py @@ -12,417 +12,24 @@ import multiprocessing import fastavro import confluent_kafka -# TODO : uncomment this next line -# and the whole PittGoogleBroker class -# when pittgoogle works again -# import pittgoogle from concurrent.futures import ThreadPoolExecutor # for pittgoogle import django.db from django.core.management.base import BaseCommand, CommandError -from elasticc2.models import BrokerMessage +import elasticc2.models _rundir = pathlib.Path(__file__).parent -sys.path.insert(0, str(_rundir) ) +_djangodir = _rundir.parent.parent.parent +_logdir = pathlib.Path( os.getenv( 'LOGDIR', '/logs' ) ) + # Add the db/management/commands directory as we include stuff from there sys.path.append( str(_rundir.parent.parent.parent / "db/management/commands" ) ) -from _consumekafkamsgs import MsgConsumer - -# class DateTimeEncoder( json.JSONEncoder ): -# def default( self, obj ): -# if isinstance( obj, datetime.datetime ): -# return obj.isoformat() -# else: -# return super().default( obj ) - -# ====================================================================== - -class BrokerConsumer: - def __init__( self, server, groupid, topics=None, updatetopics=False, - schemaless=True, reset=False, extraconfig={}, - schemafile=None, pipe=None, loggername="BROKER", **kwargs ): - - self.logger = logging.getLogger( loggername ) - self.logger.propagate = False - logout = logging.StreamHandler( sys.stderr ) - self.logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - {loggername} - %(levelname)s] - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' ) - logout.setFormatter( formatter ) - # self.logger.setLevel( logging.INFO ) - self.logger.setLevel( logging.DEBUG ) - - self.countlogger = logging.getLogger( f"countlogger_{loggername}" ) - self.countlogger.propagate = False - _countlogfile = _rundir.parent.parent.parent / f"logs/brokerpoll_counts_{loggername}.log" - 
_countlogfile.parent.mkdir( parents=True, exist_ok=True ) - _countlogout = logging.FileHandler( _countlogfile ) - _countformatter = logging.Formatter( f'[%(asctime)s - %(levelname)s] - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' ) - _countlogout.setFormatter( _countformatter ) - self.countlogger.addHandler( _countlogout ) - self.countlogger.setLevel( logging.INFO ) - - if schemafile is None: - schemafile = _rundir / "elasticc.v0_9_1.brokerClassification.avsc" - - self.countlogger.info( f"************ Starting Brokerconsumer for {loggername} ****************" ) - - self.pipe = pipe - self.server = server - self.groupid = groupid - self.topics = topics - self._updatetopics = updatetopics - self._reset = reset - self.extraconfig = extraconfig - - self.schemaless = schemaless - if not self.schemaless: - self.countlogger.error( "CRASHING. I only know how to handle schemaless streams." ) - raise RuntimeError( "I only know how to handle schemaless streams" ) - self.schemafile = schemafile - self.schema = fastavro.schema.load_schema( self.schemafile ) - - self.nmessagesconsumed = 0 - - - @property - def reset( self ): - return self._reset - - @reset.setter - def reset( self, val ): - self._reset = val - - def create_connection( self ): - countdown = 5 - while countdown >= 0: - try: - self.consumer = MsgConsumer( self.server, self.groupid, self.schemafile, self.topics, - extraconsumerconfig=self.extraconfig, - consume_nmsgs=1000, consume_timeout=1, nomsg_sleeptime=5, - logger=self.logger ) - countdown = -1 - except Exception as e: - countdown -= 1 - strio = io.StringIO("") - strio.write( f"Exception connecting to broker: {str(e)}" ) - traceback.print_exc( file=strio ) - self.logger.warning( strio.getvalue() ) - if countdown >= 0: - self.logger.warning( "Sleeping 5s and trying again." ) - time.sleep(5) - else: - self.logger.error( "Repeated exceptions connecting to broker, punting." ) - self.countlogger.error( "Repeated exceptions connecting to broker, punting." ) - raise RuntimeError( "Failed to connect to broker" ) - - if self._reset and ( self.topics is not None ): - self.countlogger.info( f"*************** Resetting to start of broker kafka stream ***************" ) - self.reset_to_start() - # Only want to reset the first time the connection is opened! - self._reset = False - - self.countlogger.info( f"**************** Consumer connection opened *****************" ) - - def close_connection( self ): - self.countlogger.info( f"**************** Closing consumer connection ******************" ) - self.consumer.close() - self.consumer = None - - def update_topics( self, *args, **kwargs ): - self.countlogger.info( "Subclass must implement this if you use it." ) - raise NotImplementedError( "Subclass must implement this if you use it." ) - - def reset_to_start( self ): - self.logger.info( "Resetting all topics to start" ) - for topic in self.topics: - self.consumer.reset_to_start( topic ) - - def handle_message_batch( self, msgs ): - messagebatch = [] - self.countlogger.info( f"Handling {len(msgs)} messages; consumer has received " - f"{self.consumer.tot_handled} messages." 
) - for msg in msgs: - timestamptype, timestamp = msg.timestamp() - if timestamptype == confluent_kafka.TIMESTAMP_NOT_AVAILABLE: - timestamp = None - else: - timestamp = datetime.datetime.fromtimestamp( timestamp / 1000, - tz=datetime.timezone.utc ) - payload = msg.value() - if not self.schemaless: - self.countlogger.error( "I only know how to handle schemaless streams" ) - raise RuntimeError( "I only know how to handle schemaless streams" ) - alert = fastavro.schemaless_reader( io.BytesIO( payload ), self.schema ) - messagebatch.append( { 'topic': msg.topic(), - 'msgoffset': msg.offset(), - 'timestamp': timestamp, - 'msg': alert } ) - added = BrokerMessage.load_batch( messagebatch, logger=self.logger ) - self.countlogger.info( f"...added {added['addedmsgs']} messages, " - f"{added['addedclassifiers']} classifiers, " - f"{added['addedclassifications']} classifications. " ) - - def poll( self, restart_time=datetime.timedelta(minutes=30) ): - self.create_connection() - while True: - if self._updatetopics: - self.update_topics() - strio = io.StringIO("") - if len(self.consumer.topics) == 0: - self.logger.info( "No topics, will wait 10s and reconnect." ) - time.sleep(10) - else: - self.logger.info( f"Subscribed to topics: {self.consumer.topics}; starting poll loop." ) - self.countlogger.info( f"Subscribed to topics: {self.consumer.topics}; starting poll loop." ) - try: - happy = self.consumer.poll_loop( handler=self.handle_message_batch, - max_consumed=None, max_runtime=restart_time, - pipe=self.pipe ) - if happy: - strio.write( f"Reached poll timeout for {self.server}; " - f"handled {self.consumer.tot_handled} messages. " ) - else: - strio.write( f"Poll loop received die command after handling " - f"{self.consumer.tot_handled} messages. Exiting." ) - self.logger.info( strio.getvalue() ) - self.countlogger.info( strio.getvalue() ) - self.close_connection() - return - except Exception as e: - otherstrio = io.StringIO("") - traceback.print_exc( file=otherstrio ) - self.logger.warning( otherstrio.getvalue() ) - strio.write( f"Exception polling: {str(e)}. " ) - - if self.pipe.poll(): - msg = self.pipe.recv() - if ( 'command' in msg ) and ( msg['command'] == 'die' ): - self.logger.info( "No topics, but also exiting broker poll due to die command." ) - self.countlogger.info( "No topics, but also existing broker poll due to die command." 
) - self.close_connection() - return - strio.write( "Reconnecting.\n" ) - self.logger.info( strio.getvalue() ) - self.countlogger.info( strio.getvalue() ) - self.close_connection() - if self._updatetopics: - self.topics = None - self.create_connection() - -# ====================================================================== -# I should replace this and the next one with a generic noauth consumer - -class BrahmsConsumer(BrokerConsumer): - def __init__( self, grouptag=None, brahms_topic=None, loggername="BRAHMS", **kwargs ): - if brahms_topic is None: - raise RuntimeError( "Must specify brahms topic" ) - server = "brahms.lbl.gov:9092" - groupid = "elasticc-lbnl" + ("" if grouptag is None else "-" + grouptag ) - topics = [ brahms_topic ] - super().__init__( server, groupid, topics=topics, loggername=loggername, **kwargs ) - self.logger.info( f"Brahms group id is {groupid}, topic is {brahms_topic}" ) - -# ====================================================================== - -class TestConsumer(BrokerConsumer): - def __init__( self, grouptag=None, test_topic=None, loggername="TEST", **kwargs ): - if test_topic is None: - raise RuntimeError( "Must specify test topic" ) - server = "kafka-server:9092" - groupid = "testing" + ("" if grouptag is None else "-" + grouptag ) - topics = [ test_topic ] - super().__init__( server, groupid, topics=topics, loggername=loggername, **kwargs ) - self.logger.info( f"Test group id is {groupid}, topic is {test_topic}" ) - -# ====================================================================== - -class AntaresConsumer(BrokerConsumer): - def __init__( self, grouptag=None, - usernamefile='/secrets/antares_username', passwdfile='/secrets/antares_passwd', - loggername="ANTARES", antares_topic='elasticc2-st1-ddf-full', **kwargs ): - server = "kafka.antares.noirlab.edu:9092" - groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) - topics = [ antares_topic ] - updatetopics = False - with open( usernamefile ) as ifp: - username = ifp.readline().strip() - with open( passwdfile ) as ifp: - passwd = ifp.readline().strip() - extraconfig = { - "api.version.request": True, - "broker.version.fallback": "0.10.0.0", - "api.version.fallback.ms": "0", - "enable.auto.commit": True, - "security.protocol": "SASL_SSL", - "sasl.mechanism": "PLAIN", - "sasl.username": username, - "sasl.password": passwd, - "ssl.ca.location": str( _rundir / "antares-ca.pem" ), - "auto.offset.reset": "earliest", - } - super().__init__( server, groupid, topics=topics, updatetopics=updatetopics, - extraconfig=extraconfig, loggername=loggername, **kwargs ) - self.logger.info( f"Antares group id is {groupid}" ) - - -# ====================================================================== - -class FinkConsumer(BrokerConsumer): - def __init__( self, grouptag=None, loggername="FINK", fink_topic='fink_elasticc-2022fall', **kwargs ): - server = "134.158.74.95:24499" - groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) - topics = [ fink_topic ] - updatetopics = False - super().__init__( server, groupid, topics=topics, updatetopics=updatetopics, - loggername=loggername, **kwargs ) - self.logger.info( f"Fink group id is {groupid}" ) - - -# ====================================================================== - -class AlerceConsumer(BrokerConsumer): - def __init__( self, - grouptag=None, - usernamefile='/secrets/alerce_username', - passwdfile='/secrets/alerce_passwd', - loggername="ALERCE", - early_offset=os.getenv( "ALERCE_TOPIC_RELDATEOFFSET", -4 ), - 
alerce_topic_pattern='^lc_classifier_.*_(\d{4}\d{2}\d{2})$', - **kwargs ): - server = os.getenv( "ALERCE_KAFKA_SERVER", "kafka.alerce.science:9093" ) - groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) - self.early_offset = int( early_offset ) - self.alerce_topic_pattern = alerce_topic_pattern - topics = None - updatetopics = True - with open( usernamefile ) as ifp: - username = ifp.readline().strip() - with open( passwdfile ) as ifp: - passwd = ifp.readline().strip() - extraconfig = { "security.protocol": "SASL_SSL", - "sasl.mechanism": "SCRAM-SHA-512", - "sasl.username": username, - "sasl.password": passwd } - super().__init__( server, groupid, topics=topics, updatetopics=updatetopics, extraconfig=extraconfig, - loggername=loggername, **kwargs ) - self.logger.info( f"Alerce group id is {groupid}" ) - - self.badtopics = [ 'lc_classifier_balto_20230807' ] - - def update_topics( self, *args, **kwargs ): - now = datetime.datetime.now() - datestrs = [] - for ddays in range(self.early_offset, 3): - then = now + datetime.timedelta( days=ddays ) - datestrs.append( f"{then.year:04d}{then.month:02d}{then.day:02d}" ) - tosub = [] - topics = self.consumer.get_topics() - for topic in topics: - match = re.search( self.alerce_topic_pattern, topic ) - if match and ( match.group(1) in datestrs ) and ( topic not in self.badtopics ): - tosub.append( topic ) - self.topics = tosub - self.consumer.subscribe( self.topics ) - -# ===================================================================== - -# class PittGoogleBroker(BrokerConsumer): -# def __init__( -# self, -# pitt_topic: str, -# pitt_project: str, -# max_workers: int = 8, # max number of ThreadPoolExecutor workers -# batch_maxn: int = 1000, # max number of messages in a batch -# batch_maxwait: int = 5, # max seconds to wait between messages before processing a batch -# loggername: str = "PITTGOOGLE", -# **kwargs -# ): -# super().__init__(server=None, groupid=None, loggername=loggername, **kwargs) - -# topic = pittgoogle.pubsub.Topic(pitt_topic, pitt_project) -# subscription = pittgoogle.pubsub.Subscription(name=f"{pitt_topic}-desc", topic=topic) -# # if the subscription doesn't already exist, this will create one in the -# # project given by the env var GOOGLE_CLOUD_PROJECT -# subscription.touch() - -# self.consumer = pittgoogle.pubsub.Consumer( -# subscription=subscription, -# msg_callback=self.handle_message, -# batch_callback=self.handle_message_batch, -# batch_maxn=batch_maxn, -# batch_maxwait=batch_maxwait, -# executor=ThreadPoolExecutor( -# max_workers=max_workers, -# initializer=self.worker_init, -# initargs=( -# self.schema, -# subscription.topic.name, -# self.logger, -# self.countlogger -# ), -# ), -# ) - -# @staticmethod -# def worker_init(classification_schema: dict, pubsub_topic: str, -# broker_logger: logging.Logger, broker_countlogger: logging.Logger ): -# """Initializer for the ThreadPoolExecutor.""" -# global countlogger -# global logger -# global schema -# global topic - -# countlogger = broker_countlogger -# logger = broker_logger -# schema = classification_schema -# topic = pubsub_topic - -# logger.info( "In worker_init" ) - -# @staticmethod -# def handle_message(alert: pittgoogle.pubsub.Alert) -> pittgoogle.pubsub.Response: -# """Callback that will process a single message. 
This will run in a background thread.""" -# global logger -# global schema -# global topic - -# logger.info( "In handle_message" ) - -# message = { -# "msg": fastavro.schemaless_reader(io.BytesIO(alert.bytes), schema), -# "topic": topic, -# # this is a DatetimeWithNanoseconds, a subclass of datetime.datetime -# # https://googleapis.dev/python/google-api-core/latest/helpers.html -# "timestamp": alert.metadata["publish_time"].astimezone(datetime.timezone.utc), -# # there is no offset in pubsub -# # if this cannot be null, perhaps the message id would work? -# "msgoffset": alert.metadata["message_id"], -# } - -# return pittgoogle.pubsub.Response(result=message, ack=True) - -# @staticmethod -# def handle_message_batch(messagebatch: list) -> None: -# """Callback that will process a batch of messages. This will run in the main thread.""" -# global logger -# global countlogger - -# logger.info( "In handle_message_batch" ) -# # import pdb; pdb.set_trace() - -# added = BrokerMessage.load_batch(messagebatch, logger=logger) -# countlogger.info( -# f"...added {added['addedmsgs']} messages, " -# f"{added['addedclassifiers']} classifiers, " -# f"{added['addedclassifications']} classifications. " -# ) - -# def poll(self): -# # this blocks indefinitely or until a fatal error -# # use Control-C to exit -# self.consumer.stream( pipe=self.pipe, heartbeat=60 ) - +from _brokerconsumer import ( BrahmsConsumer, + TestConsumer, + AntaresConsumer, + FinkConsumer, + AlerceConsumer, + # PittGoogleBroker + ) # ===================================================================== # To make this die cleanly, send the USR1 signal to it @@ -437,18 +44,17 @@ def __init__( self, *args, **kwargs ): # Make sure the log directory exists - logdir = _rundir.parent.parent.parent / "logs" - if logdir.exists(): - if not logdir.is_dir(): - raise RuntimeError( "{logdir} exists but is not a directory!" ) + if _logdir.exists(): + if not _logdir.is_dir(): + raise RuntimeError( "{_logdir} exists but is not a directory!" ) else: - logdir.mkdir( parents=True ) + _logdir.mkdir( parents=True ) - self.logger = logging.getLogger( "brokerpoll_baselogger" ) + self.logger = logging.getLogger( "elasticc2_brokerpoll_baselogger" ) self.logger.propagate = False - logout = logging.FileHandler( _rundir.parent.parent.parent / f"logs/brokerpoll.log" ) + logout = logging.FileHandler( _logdir / "elasticc2_brokerpoll.log" ) self.logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - brokerpoll - %(levelname)s] - %(message)s', + formatter = logging.Formatter( f'[%(asctime)s - elasticc2 - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logout.setFormatter( formatter ) self.logger.setLevel( logging.INFO ) @@ -487,11 +93,9 @@ def launch_broker( self, brokerclass, pipe, **options ): lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGTERM" ) ) signal.signal( signal.SIGUSR1, lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGUSR1" ) ) - consumer = brokerclass( pipe=pipe, **options ) - # Make sure this subprocess has a cassandra connection - # (Search for "connection.unregsiter" and see the comments there.) 
- # django.db.connections['cassandra'].connection.register() - # Do + consumer = brokerclass( pipe=pipe, + postgres_brokermessage_model=elasticc2.models.BrokerMessage, + **options ) consumer.poll() def handle( self, *args, **options ): @@ -524,17 +128,10 @@ def handle( self, *args, **options ): # We want to make sure that django doesn't send copies of database sessions # to the subprocesses; at least for Cassandra, that breaks things. So, # before launching all the processes, close all the database django connections - # so that each process will open a new one as it needs it + # so that each process will open a new one as it needs it. + # (They already open mongo connections as necessary, and django doesn't muck + # about with mongo, so we don't have to do things for that.) django.db.connections.close_all() - # Unfortunately, the cassandra django engine doesn't actually - # close. Looking at the source code, it looks like we need to - # "unregister" the connection, and then (empirically) we need to - # manually register the connection in each thread. Because I - # figured this out by looking at source code and not - # documentation (which is very sparse), it's entirely possible - # that a future version of the django cassandra engine will - # break this code. - # django.db.connections['cassandra'].connection.unregister() brokers = {} for name,brokerclass in brokerstodo.items(): diff --git a/tom_desc/elasticc2/management/commands/send_elasticc2_alerts.py b/tom_desc/elasticc2/management/commands/send_elasticc2_alerts.py index 724e4fa6..d0faacfe 100644 --- a/tom_desc/elasticc2/management/commands/send_elasticc2_alerts.py +++ b/tom_desc/elasticc2/management/commands/send_elasticc2_alerts.py @@ -119,14 +119,21 @@ def handle( self, *args, **options ): self.logger.warn( f"{self.runningfile} exists, not starting." ) return - starttimefile = pathlib.Path( __file__ ).parent.parent.parent / "static/elasticc2/alertsendstart" - finishedtimefile = pathlib.Path( __file__ ).parent.parent.parent / "static/elasticc2/alertsendfinish" - flushedupdatetimefile = pathlib.Path( __file__ ).parent.parent.parent / "static/elasticc2/alertssentupdate" - flushednumfile = pathlib.Path( __file__ ).parent.parent.parent / "static/elasticc2/alertssent" + staticdir = pathlib.Path(__file__).parent.parent.parent / "static/elasticc2" + if not staticdir.is_dir(): + if staticdir.exists(): + raise FileExistsError( f"{staticdir} exists but is not a directory!" ) + staticdir.mkdir( exist_ok=True, parents=True ) + starttimefile = staticdir / "alertsendstart" + finishedtimefile = staticdir / "alertsendfinish" + flushedupdatetimefile = staticdir / "alertssentupdate" + flushednumfile = staticdir / "alertssent" # ...this doesn't seem to work inside a django management command. # The signals are never caught. # I hate that. I wish there was a way to override it. + # (The USR1 signal *does* seem to work; see hack in brokerpoll2.py, + # and the corresponding dumb-init hack in the tom-brokerpoll.yaml Spin file.) 
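For reference, the USR1 escape hatch that these comments (and brokerpoll2.py) rely on boils down to the pattern below. This is a standalone sketch, not code from the patch; the PollLoop name is invented for illustration.

    import signal
    import time

    class PollLoop:
        # Toy stand-in for the brokerpoll main loop, just to show the signal wiring.
        def __init__( self ):
            self.mustdie = False
            # Per the comments above, SIGTERM/SIGINT handlers set inside a django
            # management command never seem to fire, but SIGUSR1 does, so trap that.
            signal.signal( signal.SIGUSR1, lambda sig, stack: self.request_die() )

        def request_die( self ):
            self.mustdie = True

        def run( self ):
            while not self.mustdie:
                time.sleep( 1 )          # real polling work would go here
            print( "Shutting down cleanly." )

    if __name__ == "__main__":
        PollLoop().run()                 # stop it with: kill -USR1 <pid>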
# signal.signal( signal.SIGINT, lambda signum, frame: self.interruptor( signum, frame ) ) # signal.signal( signal.SIGTERM, lambda signum, frame: self.interruptor( signum, frame ) ) diff --git a/tom_desc/fastdb_dev/DataTools.py b/tom_desc/fastdb_dev/DataTools.py index 67605dd2..3db5ec3a 100644 --- a/tom_desc/fastdb_dev/DataTools.py +++ b/tom_desc/fastdb_dev/DataTools.py @@ -1,6 +1,7 @@ import datetime import json import os +import pathlib import psycopg2 from psycopg2.extras import execute_batch from time import sleep @@ -47,7 +48,7 @@ from rest_framework.settings import api_settings _logger = logging.getLogger("fastdb_queries") -_logout = logging.FileHandler("/code/logs/queries.log") +_logout = logging.FileHandler( pathlib.Path( os.getenv('LOGDIR'), "/logs" ) / "fastdb_queries.log" ) _logger.addHandler( _logout ) _formatter = logging.Formatter( f'[%(asctime)s - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) diff --git a/tom_desc/db/management/commands/create_cassandra_db.py b/tom_desc/fastdb_dev/management/commands/fastdb_dev_brokerpoll.py similarity index 61% rename from tom_desc/db/management/commands/create_cassandra_db.py rename to tom_desc/fastdb_dev/management/commands/fastdb_dev_brokerpoll.py index a057b955..e683929a 100644 --- a/tom_desc/db/management/commands/create_cassandra_db.py +++ b/tom_desc/fastdb_dev/management/commands/fastdb_dev_brokerpoll.py @@ -1,54 +1,43 @@ +from pymongo import MongoClient import sys +import pathlib +import logging +import fastavro +import json +import multiprocessing +import signal +import time +import confluent_kafka +import io import os -import django.db -import django_cassandra_engine +import re +import traceback +import datetime +import collections +import atexit +import argparse +import urllib -class Command(BaseCommand): - help = 'Create Cassandra database and keyspace' - def handle( self, *args, **options ): - cursor = django.db.connections['cassandra'].cursor() - cursor.execute( - def __init__( self, *args, **kwargs ): - super().__init__( *args, **kwargs ) +class Broker(object): - # Make sure the log directory exists - - logdir = _rundir.parent.parent.parent / "logs" - if logdir.exists(): - if not logdir.is_dir(): - raise RuntimeError( "{logdir} exists but is not a directory!" 
) - else: - logdir.mkdir( parents=True ) + def __init__( self, username=None, password=None, *args, **options ): self.logger = logging.getLogger( "brokerpoll_baselogger" ) self.logger.propagate = False - logout = logging.FileHandler( _rundir.parent.parent.parent / f"logs/brokerpoll.log" ) + logout = logging.FileHandler( _logdir / f"logs/brokerpoll.log" ) self.logger.addHandler( logout ) formatter = logging.Formatter( f'[%(asctime)s - brokerpoll - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logout.setFormatter( formatter ) - self.logger.setLevel( logging.INFO ) - - def add_arguments( self, parser ): - parser.add_argument( '--do-alerce', action='store_true', default=False, help="Poll from ALERCE" ) - parser.add_argument( '--do-antares', action='store_true', default=False, help="Poll from ANTARES" ) - parser.add_argument( '--do-fink', action='store_true', default=False, help="Poll from FINK" ) - parser.add_argument( '--do-brahms', action='store_true', default=False, - help="Poll from Rob's test kafka server" ) - parser.add_argument( '--brahms-topic', default=None, - help="Topic to poll on brahms (required if --do-brahms is True)" ) - parser.add_argument( '--do-pitt', action='store_true', default=False, help="Poll from PITT-Google" ) - parser.add_argument( '--pitt-topic', default=None, help="Topic name for PITT-Google" ) - parser.add_argument( '--pitt-project', default=None, help="Project name for PITT-Google" ) - parser.add_argument( '--do-test', action='store_true', default=False, - help="Poll from kafka-server:9092 (for testing purposes)" ) - parser.add_argument( '---test-topic', default='classifications', - help="Topic to poll from on kafka-server:9092" ) - parser.add_argument( '-g', '--grouptag', default=None, help="Tag to add to end of kafka group ids" ) - parser.add_argument( '-r', '--reset', default=False, action='store_true', - help='Reset all stream pointers' ) + self.logger.setLevel( logging.DEBUG ) + if options['reset']: + self.reset = options['reset'] + + self.username = username + self.password = password + def sigterm( self, sig="TERM" ): self.logger.warning( f"Got a {sig} signal, trying to die." ) @@ -59,18 +48,16 @@ def launch_broker( self, brokerclass, pipe, **options ): lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGINT" ) ) signal.signal( signal.SIGTERM, lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGTERM" ) ) - signal.signal( signal.SIGUSR1, - lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGUSR1" ) ) consumer = brokerclass( pipe=pipe, **options ) consumer.poll() - def handle( self, *args, **options ): + def broker_poll( self, *args, **options): self.logger.info( "******** brokerpoll starting ***********" ) self.mustdie = False signal.signal( signal.SIGTERM, lambda sig, stack: self.sigterm( "TERM" ) ) signal.signal( signal.SIGINT, lambda sig, stack: self.sigterm( "INT" ) ) - signal.signal( signal.SIGUSR1, lambda sig, stack: self.sigterm( "USR1" ) ) + brokerstodo = {} if options['do_alerce']: @@ -86,16 +73,16 @@ def handle( self, *args, **options ): if options['do_test']: brokerstodo['test'] = TestConsumer if len( brokerstodo ) == 0: - self.logger.error( "Must give at least one broker to listen to." ) - raise RuntimeError( "No brokers given to listen to." ) - - # Launch a process for each broker that will poll that broker indefinitely + print( "Must give at least one broker to listen to." 
) brokers = {} + + # Launch a process for each broker that will poll that broker indefinitely + for name,brokerclass in brokerstodo.items(): self.logger.info( f"Launching thread for {name}" ) parentconn, childconn = multiprocessing.Pipe() - proc = multiprocessing.Process( target=lambda: self.launch_broker(brokerclass, childconn, **options) ) + proc = multiprocessing.Process( target=self.launch_broker(brokerclass, childconn, **options) ) proc.start() brokers[name] = { "process": proc, "pipe": parentconn, @@ -131,8 +118,7 @@ def handle( self, *args, **options ): for name, broker in brokers.items(): dt = time.monotonic() - broker['lastheartbeat'] if dt > toolongsilent: - self.logger.error( f"It's been {dt:.0f} seconds since last heartbeat from {name}; " - f"will restart." ) + self.logger.error( f"It's been {dt:.0f} seconds since last heartbeat from {name}; "f"will restart." ) brokerstorestart.add( name ) for torestart in brokerstorestart: @@ -158,3 +144,44 @@ def handle( self, *args, **options ): time.sleep( 20 ) self.logger.warning( "Exiting." ) return + + +if __name__ == '__main__': + + logger = logging.getLogger( "brokerpoll_baselogger" ) + logger.propagate = False + logout = logging.FileHandler( _logdir / f"logs/brokerpoll.log" ) + logger.addHandler( logout ) + formatter = logging.Formatter( f'[%(asctime)s - brokerpoll - %(levelname)s] - %(message)s',datefmt='%Y-%m-%d %H:%M:%S' ) + logout.setFormatter( formatter ) + logger.setLevel( logging.DEBUG ) + + + parser = argparse.ArgumentParser() + + parser.add_argument( '--do-alerce', action='store_true', default=False, help="Poll from ALeRCE" ) + parser.add_argument( '--alerce-topic-pattern', default='^lc_classifier_.*_(\d{4}\d{2}\d{2})$', + help='Regex for matching ALeRCE topics (warning: custom code, see AlerceBroker)' ) + parser.add_argument( '--do-antares', action='store_true', default=False, help="Poll from ANTARES" ) + parser.add_argument( '--antares-topic', default=None, help='Topic name for Antares' ) + parser.add_argument( '--do-fink', action='store_true', default=False, help="Poll from Fink" ) + parser.add_argument( '--fink-topic', default=None, help='Topic name for Fink' ) + parser.add_argument( '--do-brahms', action='store_true', default=False, + help="Poll from Rob's test kafka server" ) + parser.add_argument( '--brahms-topic', default=None, + help="Topic to poll on brahms (required if --do-brahms is True)" ) + parser.add_argument( '--do-pitt', action='store_true', default=False, help="Poll from PITT-Google" ) + parser.add_argument( '--pitt-topic', default=None, help="Topic name for PITT-Google" ) + parser.add_argument( '--pitt-project', default=None, help="Project name for PITT-Google" ) + parser.add_argument( '--do-test', action='store_true', default=False, + help="Poll from kafka-server:9092 (for testing purposes)" ) + parser.add_argument( '--test-topic', default='classifications', + help="Topic to poll from on kafka-server:9092" ) + parser.add_argument( '-g', '--grouptag', default=None, help="Tag to add to end of kafka group ids" ) + parser.add_argument('-r', '--reset', action='store_true', default=False, help='Reset all stream pointers') + + options = vars(parser.parse_args()) + + broker = Broker(**options) + + poll = broker.broker_poll(**options) diff --git a/tom_desc/fastdb_dev/management/commands/load_fastdb.py b/tom_desc/fastdb_dev/management/commands/load_fastdb.py index a369852b..85b4e7c1 100644 --- a/tom_desc/fastdb_dev/management/commands/load_fastdb.py +++ b/tom_desc/fastdb_dev/management/commands/load_fastdb.py @@ 
-62,7 +62,11 @@ def handle( self, *args, **options ): mongo_username = urllib.parse.quote_plus(os.environ['MONGODB_ALERT_WRITER']) mongo_password = urllib.parse.quote_plus(os.environ['MONGODB_ALERT_WRITER_PASSWORD']) - client = MongoClient("mongodb://%s:%s@fastdbdev-mongodb:27017/?authSource=alerts" %(mongo_username,mongo_password) ) + # mongodb running on port 27017 on host $MONGOHOST; default + # $MONGOHOST to fastdbdev-mongodb for backwards compatibility + # with previous installs + mongohost = os.getenv( 'MONGOHOST', 'fastdbdev-mongodb' ) + client = MongoClient(f"mongodb://{mongo_username}:{mongo_password}@{mongohost}:27017/?authSource=alerts") self.db = client.alerts # Connect to the PPDB From a862a59feead14e3e95cb733be94342a9206e2e1 Mon Sep 17 00:00:00 2001 From: Rob Knop Date: Thu, 31 Oct 2024 16:00:07 -0700 Subject: [PATCH 5/5] Have tests working for fastdb version of alert cycle. Some problems still to be fixed. --- tests/alertcyclefixtures.py | 210 ++++++++++++++++-- tests/docker-compose.yaml | 58 ++++- tests/setup_mongodb.py | 19 ++ tests/test_elasticc2_alertcycle.py | 35 +-- tests/test_fastdb_dev_alertcycle.py | 69 ++++++ .../db/management/commands/_brokerconsumer.py | 33 ++- .../management/commands/brokerpoll2.py | 16 +- .../commands/fastdb_dev_brokerpoll.py | 168 +++++++------- .../management/commands/load_fastdb.py | 209 ++++++++++++----- 9 files changed, 630 insertions(+), 187 deletions(-) create mode 100644 tests/setup_mongodb.py create mode 100644 tests/test_fastdb_dev_alertcycle.py diff --git a/tests/alertcyclefixtures.py b/tests/alertcyclefixtures.py index 5a7e71a0..8135a3a4 100644 --- a/tests/alertcyclefixtures.py +++ b/tests/alertcyclefixtures.py @@ -1,10 +1,12 @@ # IMPORTANT -- running any tests that depend on fixtures in this file # OTHER than alert_cycle_complete requires a completely fresh # environment. After any run of "pytest ...", if you want to run tests -# (e.g. in test_alert_cycle.py) that use these fixtures, kyou have to +# (e.g. in test_alert_cycle.py) that use these fixtures, you have to # completely tear down and rebuild the docker compose environment. This # is because, as noted below, we can't easily clean up the kafka -# server's state, so on a rerun, the server state will be wrong. +# server's state, so on a rerun, the server state will be wrong. (We +# also use that as a reason to be lazy and not clean up the database; +# see the long comment below.) import sys import os @@ -16,12 +18,15 @@ import subprocess import pytest +from pymongo import MongoClient + sys.path.insert( 0, "/tom_desc" ) os.environ["DJANGO_SETTINGS_MODULE"] = "tom_desc.settings" import django django.setup() import elasticc2.models +import fastdb_dev.models import tom_targets.models from tom_client import TomClient @@ -115,7 +120,7 @@ def classifications_300days_exist( alerts_300days ): @pytest.fixture( scope="session" ) -def classifications_300days_ingested( classifications_300days_exist ): +def classifications_300days_elasticc2_ingested( classifications_300days_exist ): # Have to have an additional sleep after the classifications exist, # because brokerpoll itself has a 10s sleep loop time.sleep( 11 ) @@ -124,7 +129,7 @@ def classifications_300days_ingested( classifications_300days_exist ): # file because I can't clean up, and there is hysteresis. Once # later fixtures have run, the tests below would fail, and these # fixtures may be used in more than one test. 
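Stripped of the TOM-specific details, the pattern these session-scoped fixtures follow looks roughly like the sketch below: the assertions run inside the fixture, exactly once, right after the state they check has been built, and the test itself only asserts the fixture's yielded value. The names and numbers here are invented for illustration.

    import pytest

    @pytest.fixture( scope="session" )
    def state_built_and_checked():
        state = { "count": 1090 }        # stand-in for loading alerts/classifications
        # The checks have to happen here; by the time a later test runs,
        # later fixtures will have changed the state and they would fail.
        assert state["count"] == 1090
        yield True

    def test_state_ok( state_built_and_checked ):
        assert state_built_and_checked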
- + brkmsg = elasticc2.models.BrokerMessage cfer = elasticc2.models.BrokerClassifier bsid = elasticc2.models.BrokerSourceIds @@ -147,11 +152,48 @@ def classifications_300days_ingested( classifications_300days_exist ): assert ( set( [ i.classifiername for i in cfer.objects.all() ] ) == set( [ "NugentClassifier", "RandomSNType" ] ) ) - + yield True - + + @pytest.fixture( scope="session" ) -def update_diasource_300days( classifications_300days_ingested ): +def classifications_300days_fastdb_dev_ingested( classifications_300days_exist ): + # Have to have an additional sleep after the classifications exist, + # because brokerpoll itself has a 10s sleep loop + time.sleep( 11 ) + + # Have to have these tests here rather than in the actual test_* + # file because I can't clean up, and there is hysteresis. Once + # later fixtures have run, the tests below would fail, and these + # fixtures may be used in more than one test. + + host = os.getenv( 'MONGOHOST' ) + username = os.getenv( 'MONGODB_ALERT_READER' ) + password = os.getenv( 'MONGODB_ALERT_READER_PASSWORD' ) + client = MongoClient( f"mongodb://{username}:{password}@{host}:27017/?authSource=alerts" ) + db = client.alerts + + assert 'fakebroker' in db.list_collection_names() + + coll = db.fakebroker + assert coll.count_documents({}) == 1090 + + numprobs = 0 + for msg in coll.find(): + msg = msg['msg'] + assert msg['brokerName'] == 'FakeBroker' + assert msg['classifierName'] in [ 'RandomSNType', 'NugentClassifier' ] + if msg['classifierName'] == 'NugentClassifier': + assert len( msg['classifications'] ) == 1 + assert msg['classifications'][0]['classId'] == 2222 + assert msg['classifications'][0]['probability'] == 1.0 + numprobs += len( msg['classifications'] ) + assert numprobs == 11445 + + yield True + +@pytest.fixture( scope="session" ) +def update_elasticc2_diasource_300days( classifications_300days_elasticc2_ingested ): result = subprocess.run( [ "python", "manage.py", "update_elasticc2_sources" ], cwd="/tom_desc", capture_output=True ) assert result.returncode == 0 @@ -171,10 +213,54 @@ def update_diasource_300days( classifications_300days_ingested ): # assert targ.objects.count() == obj.objects.count() assert src.objects.count() == 545 assert frced.objects.count() == 4242 - + yield True +@pytest.fixture( scope="session" ) +def update_fastdb_dev_diasource_300days( classifications_300days_fastdb_dev_ingested ): + result = subprocess.run( [ "python", "manage.py", "load_fastdb", + "--pv", "test_pv", "--snapshot", "test_ss", + "--tag", "test_ss_tag", + "--brokers", "fakebroker" ], + cwd="/tom_desc", + capture_output=True ) + assert result.returncode == 0 + + lut = fastdb_dev.models.LastUpdateTime + obj = fastdb_dev.models.DiaObject + src = fastdb_dev.models.DiaSource + frced = fastdb_dev.models.DiaForcedSource + cfer = fastdb_dev.models.BrokerClassifier + cification = fastdb_dev.models.BrokerClassification + pver = fastdb_dev.models.ProcessingVersions + ss = fastdb_dev.models.Snapshots + dspvss = fastdb_dev.models.DStoPVtoSS + dfspvss = fastdb_dev.models.DFStoPVtoSS + + assert lut.objects.count() == 1 + assert lut.objects.first().last_update_time > datetime.datetime.fromtimestamp( 0, tz=datetime.timezone.utc ) + assert lut.objects.first().last_update_time < datetime.datetime.now( tz=datetime.timezone.utc ) + + # TODO : right now, load_fastdb.py imports the future -- that is, it imports + # the full ForcedSource lightcure for an object for which we got a source + # the first time that source is seen, and never looks at forcedsources + # again. 
Update the tests numbers if/when it simulates not knowing the + # future. + # (Really, we should probably creat a whole separate simulated PPDB server with + # an interface that will look something like the real PPDB interface... when + # we actually know what that is.) + + assert obj.objects.count() == 102 + assert src.objects.count() == 545 + assert frced.objects.count() == 15760 # 4242 + assert cfer.objects.count() == 2 + # assert cification.objects.count() == 831 # ???? WHy is this not 545 * 2 ? LOOK INTO THIS + # # ---> seems to be non-deterministic! + # TODO : pver, ss, dpvss, dfspvss + + yield True + @pytest.fixture( scope="session" ) def alerts_100daysmore( alerts_300days ): # This will send alerts up through mjd 60676. Why not 60678, since the previous @@ -197,14 +283,14 @@ def alerts_100daysmore( alerts_300days ): # Same issue as alerts_300days about not cleaning up @pytest.fixture( scope="session" ) -def classifications_100daysmore_ingested( alerts_100daysmore ): +def classifications_100daysmore_elasticc2_ingested( alerts_100daysmore ): # This time we need to allow for both the 10s sleep cycle timeout of # brokerpoll and fakebroker (since we're not checking # classifications exist separately from ingested) time.sleep( 22 ) # Tests here because of hysteresis - + brkmsg = elasticc2.models.BrokerMessage cfer = elasticc2.models.BrokerClassifier @@ -224,10 +310,45 @@ def classifications_100daysmore_ingested( alerts_100daysmore ): == set( [ "NugentClassifier", "RandomSNType" ] ) ) yield True - + @pytest.fixture( scope="session" ) -def update_diasource_100daysmore( classifications_100daysmore_ingested ): +def classifications_100daysmore_fastdb_dev_ingested( alerts_100daysmore ): + # This time we need to allow for both the 10s sleep cycle timeout of + # brokerpoll and fakebroker (since we're not checking + # classifications exist separately from ingested) + time.sleep( 22 ) + + # Tests here because of hysteresis + + host = os.getenv( 'MONGOHOST' ) + username = os.getenv( 'MONGODB_ALERT_READER' ) + password = os.getenv( 'MONGODB_ALERT_READER_PASSWORD' ) + client = MongoClient( f"mongodb://{username}:{password}@{host}:27017/?authSource=alerts" ) + db = client.alerts + + assert 'fakebroker' in db.list_collection_names() + + coll = db.fakebroker + assert coll.count_documents({}) == 1300 + + numprobs = 0 + for msg in coll.find(): + msg = msg['msg'] + assert msg['brokerName'] == 'FakeBroker' + assert msg['classifierName'] in [ 'RandomSNType', 'NugentClassifier' ] + if msg['classifierName'] == 'NugentClassifier': + assert len( msg['classifications'] ) == 1 + assert msg['classifications'][0]['classId'] == 2222 + assert msg['classifications'][0]['probability'] == 1.0 + numprobs += len( msg['classifications'] ) + assert numprobs == 13650 + + yield True + + +@pytest.fixture( scope="session" ) +def update_elasticc2_diasource_100daysmore( classifications_100daysmore_elasticc2_ingested ): result = subprocess.run( [ "python", "manage.py", "update_elasticc2_sources" ], cwd="/tom_desc", capture_output=True ) assert result.returncode == 0 @@ -248,7 +369,52 @@ def update_diasource_100daysmore( classifications_100daysmore_ingested ): assert frced.objects.count() == 5765 yield True - + + +@pytest.fixture( scope="session" ) +def update_fastdb_dev_diasource_100daysmore( classifications_100daysmore_fastdb_dev_ingested ): + # SEE COMMENTS IN update_fastdb_dev_diasource_300days + + result = subprocess.run( [ "python", "manage.py", "load_fastdb", + "--pv", "test_pv", "--snapshot", "test_ss", + "--tag", 
"test_ss_tag", + "--brokers", "fakebroker" ], + cwd="/tom_desc", + capture_output=True ) + assert result.returncode == 0 + + lut = fastdb_dev.models.LastUpdateTime + obj = fastdb_dev.models.DiaObject + src = fastdb_dev.models.DiaSource + frced = fastdb_dev.models.DiaForcedSource + cfer = fastdb_dev.models.BrokerClassifier + cification = fastdb_dev.models.BrokerClassification + pver = fastdb_dev.models.ProcessingVersions + ss = fastdb_dev.models.Snapshots + dspvss = fastdb_dev.models.DStoPVtoSS + dfspvss = fastdb_dev.models.DFStoPVtoSS + + assert lut.objects.count() == 1 + assert lut.objects.first().last_update_time > datetime.datetime.fromtimestamp( 0, tz=datetime.timezone.utc ) + assert lut.objects.first().last_update_time < datetime.datetime.now( tz=datetime.timezone.utc ) + + # TODO : right now, load_fastdb.py imports the future -- that is, it imports + # the full ForcedSource lightcure for an object for which we got a source + # the first time that source is seen, and never looks at forcedsources + # again. Update the tests numbers if/when it simulates not knowing the + # future. + # (Really, we should probably creat a whole separate simulated PPDB server with + # an interface that will look something like the real PPDB interface... when + # we actually know what that is.) + + assert obj.objects.count() == 131 + assert src.objects.count() == 650 + assert frced.objects.count() == 20834 # 5765 + assert cfer.objects.count() == 2 + # assert cification.objects.count() == ... # ???? WHy is this not 650 * 2 ? LOOK INTO THIS + # TODO : pver, ss, dpvss, dfspvss + + yield True @pytest.fixture( scope="session" ) def api_classify_existing_alerts( alerts_100daysmore, apibroker_client ): @@ -324,18 +490,22 @@ def alert_cycle_complete( request, tomclient ): json={ 'query': 'SELECT COUNT(*) AS count FROM elasticc2_brokermessage' } ) rows = res.json()[ 'rows' ] if rows[0]['count'] == 0: - request.getfixturevalue( "update_diasource_100daysmore" ) + request.getfixturevalue( "update_elasticc2_diasource_100daysmore" ) + request.getfixturevalue( "update_fastdb_dev_diasource_100daysmore" ) request.getfixturevalue( "api_classify_existing_alerts" ) yield True - - + + __all__ = [ 'alerts_300days', 'classifications_300days_exist', - 'classifications_300days_ingested', - 'update_diasource_300days', + 'classifications_300days_elasticc2_ingested', + 'classifications_300days_fastdb_dev_ingested', + 'update_elasticc2_diasource_300days', + 'update_fastdb_dev_diasource_300days', 'alerts_100daysmore', - 'classifications_100daysmore_ingested', - 'update_diasource_100daysmore', + 'classifications_100daysmore_elasticc2_ingested', + 'classifications_100daysmore_fastdb_dev_ingested', + 'update_fastdb_dev_diasource_100daysmore', 'api_classify_existing_alerts', 'alert_cycle_complete' ] diff --git a/tests/docker-compose.yaml b/tests/docker-compose.yaml index dcc9a977..ded0f2f8 100644 --- a/tests/docker-compose.yaml +++ b/tests/docker-compose.yaml @@ -61,6 +61,12 @@ services: timeout: 10s retries: 5 + # TODO: make sure this one fails if any of the commands it runs inside + # the bash shell below fails. (That way, the rest of the compose + # environment won't come up if it depends on this having + # completed_successfully.) (In particular, right now, + # at least if the manage.py migrate fails, it doesn't cause + # the container think it failed to complete successfully.) 
createdb: depends_on: postgres: @@ -92,6 +98,13 @@ services: DB_USER: postgres DB_PASS: fragile DB_PORT: 5432 + MONGOHOST: mongodb + MONGODB_ADMIN: mongodb_admin + MONGODB_ADMIN_PASSWORD: fragile + MONGODB_ALERT_WRITER: mongodb_alert_writer + MONGODB_ALERT_WRITER_PASSWORD: writer + MONGODB_ALERT_READER: mongdb_alert_reader + MONGODB_ALERT_READER_PASSWORD: reader entrypoint: - /bin/bash - -c @@ -100,6 +113,7 @@ services: python tom_desc/manage.py create_test_superuser python tom_desc/manage.py create_test_apibroker python /tests/create_postgres_ro_user.py + python /tests/setup_mongodb.py tom: @@ -170,7 +184,47 @@ services: DB_USER: postgres DB_PASS: fragile DB_PORT: 5432 - entrypoint: [ "python", "manage.py", "brokerpoll2", "--do-test" ] + entrypoint: [ "python", "manage.py", "brokerpoll2", "--do-test", "--grouptag", "elasticc2" ] + + + brokerpoll_fastdb_dev: + depends_on: + createdb: + condition: service_completed_successfully + tom: + condition: service_started + fakebroker: + condition: service_started + image: registry.nersc.gov/m1727/raknop/tom_desc_bindmount + build: + context: ../ + dockerfile: docker_server/Dockerfile + target: tom-server-bindmount + volumes: + - type: bind + source: ../secrets + target: /secrets + - type: bind + source: ../tom_desc + target: /tom_desc + - type: volume + source: logs + target: /logs + environment: + LOGDIR: /logs + MONGOHOST: mongodb + MONGODB_ADMIN: mongodb_admin + MONGODB_ADMIN_PASSWORD: fragile + MONGODB_ALERT_WRITER: mongodb_alert_writer + MONGODB_ALERT_WRITER_PASSWORD: writer + MONGODB_ALERT_READER: mongdb_alert_reader + MONGODB_ALERT_READER_PASSWORD: reader + DB_NAME: tom_desc + DB_HOST: postgres + DB_USER: postgres + DB_PASS: fragile + DB_PORT: 5432 + entrypoint: [ "python", "manage.py", "fastdb_dev_brokerpoll", "--do-test", "--grouptag", "fastdb_dev" ] # Thought required: want to make this dependent on @@ -191,6 +245,8 @@ services: condition: service_started brokerpoll: condition: service_started + brokerpoll_fastdb_dev: + condition: service_started image: registry.nersc.gov/m1727/raknop/tom_server_bindmount_dev build: context: ../ diff --git a/tests/setup_mongodb.py b/tests/setup_mongodb.py new file mode 100644 index 00000000..57b0809b --- /dev/null +++ b/tests/setup_mongodb.py @@ -0,0 +1,19 @@ +import sys +import os +from pymongo import MongoClient + +host = os.getenv( 'MONGOHOST' ) +user = os.getenv( 'MONGODB_ADMIN' ) +password = os.getenv( 'MONGODB_ADMIN_PASSWORD' ) + +client = MongoClient( f"mongodb://{user}:{password}@{host}:27017/" ) +sys.stderr.write( "Creating mongodb database alerts\n" ) +db = client.alerts + +users = [ os.getenv( "MONGODB_ALERT_WRITER" ), os.getenv( "MONGODB_ALERT_READER" ) ] +passwds = [ os.getenv( "MONGODB_ALERT_WRITER_PASSWORD" ), os.getenv( "MONGODB_ALERT_READER_PASSWORD" ) ] +roleses = [ [ "readWrite" ] , [ "read" ] ] + +for user, passwd, roles in zip( users, passwds, roleses ): + sys.stderr.write( f"Creating mongodb user {user} with password {passwd} and roles {roles}\n" ) + db.command( "createUser", user, pwd=passwd, roles=roles ) diff --git a/tests/test_elasticc2_alertcycle.py b/tests/test_elasticc2_alertcycle.py index da8c65cc..196d9453 100644 --- a/tests/test_elasticc2_alertcycle.py +++ b/tests/test_elasticc2_alertcycle.py @@ -1,3 +1,14 @@ +# WARNING -- if you run both this test and test_fastdb_dev_alertcycle +# within the same docker compose session, but different pytest +# sessions, one will fail. For the reason, see the comments in +# alertcyclefixtures.py. 
(Basically, the first one you run will load +# up both databases, so early tests that expect not-fully-loaded +# databases will fail.) +# +# Both should all pass if you run them both at once, i.e. +# +# pytest -v test_elasticc2_alertcycle.py test_fastdb_dev_alertcycle.py + import os import sys import datetime @@ -20,11 +31,7 @@ # the tests below. See comments in alercyclefixtures.py for the reason for # this. -class TestAlertCycle: - def test_hello_world( self ): - # This is just here so I can get a timestamp to see how long the next test took - assert True - +class TestElasticc2AlertCycle: def test_ppdb_loaded( self, elasticc2_ppdb ): # I should probably have some better tests than just object counts.... assert elasticc2.models.PPDBDiaObject.objects.count() == 346 @@ -46,20 +53,20 @@ def test_classifications_exist( self, classifications_300days_exist ): assert classifications_300days_exist - def test_classifications_ingested( self, classifications_300days_ingested ): - assert classifications_300days_ingested + def test_classifications_ingested( self, classifications_300days_elasticc2_ingested ): + assert classifications_300days_elasticc2_ingested - def test_sources_updated( self, update_diasource_300days ): - assert update_diasource_300days + def test_sources_updated( self, update_elasticc2_diasource_300days ): + assert update_elasticc2_diasource_300days - def test_100moredays_classifications_ingested( self, classifications_100daysmore_ingested ): - assert classifications_100daysmore_ingested + def test_100moredays_classifications_ingested( self, classifications_100daysmore_elasticc2_ingested ): + assert classifications_100daysmore_elasticc2_ingested - def test_100moredays_sources_updated( self, update_diasource_100daysmore ): - assert update_diasource_100daysmore + def test_100moredays_sources_updated( self, update_elasticc2_diasource_100daysmore ): + assert update_elasticc2_diasource_100daysmore def test_apibroker_existingsources( self, api_classify_existing_alerts ): @@ -79,7 +86,7 @@ def test_apibroker_existingsources( self, api_classify_existing_alerts ): # The test api broker will add 1300 probabilities # (since it assignes probabilities to two classes). # Add that to the 13650 probabilities that - # are in fixture classifications_100daysmore_ingested, + # are in fixture classifications_100daysmore_elasticc2_ingested, # and you get 14950 for msg in brkmsg.objects.all(): assert len(msg.classid) == len(msg.probability) diff --git a/tests/test_fastdb_dev_alertcycle.py b/tests/test_fastdb_dev_alertcycle.py new file mode 100644 index 00000000..ffc8ef85 --- /dev/null +++ b/tests/test_fastdb_dev_alertcycle.py @@ -0,0 +1,69 @@ +# WARNING -- if you run both this test and test_elasticc2_alertcycle +# within the same docker compose session, but different pytest +# sessions, one will fail. For the reason, see the comments in +# alertcyclefixtures.py. (Basically, the first one you run will load +# up both databases, so early tests that expect not-fully-loaded +# databases will fail.) +# +# Both should all pass if you run them both at once, i.e. +# +# pytest -v test_elasticc2_alertcycle.py test_fastdb_dev_alertcycle.py + +import os +import sys +import datetime +import time + +sys.path.insert( 0, "/tom_desc" ) + +import elasticc2.models + +from msgconsumer import MsgConsumer + +# pytest is mysterious. 
I tried importing just the fixtures I was using
+# from alertcyclefixtures, but a fixture there that used another
+# fixture from alertcyclefixtures that I did *not* import here couldn't
+# find that other fixture.  So, I import *, and use an __all__ in
+# alertcyclefixtures.
+from alertcyclefixtures import *
+
+# NOTE -- many of the actual tests are run in the fixtures rather than
+# the tests below.  See comments in alertcyclefixtures.py for the reason for
+# this.
+
+class TestFastDBDevAlertCycle:
+    def test_ppdb_loaded( self, elasticc2_ppdb ):
+        # I should probably have some better tests than just object counts....
+        assert elasticc2.models.PPDBDiaObject.objects.count() == 346
+        assert elasticc2.models.PPDBDiaSource.objects.count() == 1862
+        assert elasticc2.models.PPDBAlert.objects.count() == elasticc2.models.PPDBDiaSource.objects.count()
+        assert elasticc2.models.PPDBDiaForcedSource.objects.count() == 52172
+        assert elasticc2.models.DiaObjectTruth.objects.count() == elasticc2.models.PPDBDiaObject.objects.count()
+
+
+    def handle_test_send_alerts( self, msgs ):
+        self._test_send_alerts_count += len(msgs)
+
+
+    def test_send_alerts( self, alerts_300days ):
+        assert alerts_300days
+
+
+    def test_classifications_exist( self, classifications_300days_exist ):
+        assert classifications_300days_exist
+
+
+    def test_classifications_ingested( self, classifications_300days_fastdb_dev_ingested ):
+        assert classifications_300days_fastdb_dev_ingested
+
+
+    def test_sources_updated( self, update_fastdb_dev_diasource_300days ):
+        assert update_fastdb_dev_diasource_300days
+
+
+    def test_100moredays_classifications_ingested( self, classifications_100daysmore_fastdb_dev_ingested ):
+        assert classifications_100daysmore_fastdb_dev_ingested
+
+
+    def test_100moredays_sources_updated( self, update_fastdb_dev_diasource_100daysmore ):
+        assert update_fastdb_dev_diasource_100daysmore
diff --git a/tom_desc/db/management/commands/_brokerconsumer.py b/tom_desc/db/management/commands/_brokerconsumer.py
index 74f31a2a..dfb4286a 100644
--- a/tom_desc/db/management/commands/_brokerconsumer.py
+++ b/tom_desc/db/management/commands/_brokerconsumer.py
@@ -15,6 +15,7 @@
 # TODO : uncomment this next line
 # and the whole PittGoogleBroker class
 # when pittgoogle works again
+# from concurrent.futures import ThreadPoolExecutor  # for pittgoogle
 # import pittgoogle
 _rundir = pathlib.Path(__file__).parent
@@ -37,10 +38,11 @@ class BrokerConsumer:
     Currently may assume that broker messages are coming in the elasticc2 v0.91 schema.
""" + _brokername = 'unknown_broker' def __init__( self, server, groupid, topics=None, updatetopics=False, schemaless=True, reset=False, extraconfig={}, - schemafile=None, pipe=None, loggername="BROKER", + schemafile=None, pipe=None, loggername="BROKER", loggername_prefix='', postgres_brokermessage_model=None, mongodb_dbname=None, mongodb_collection=None, **kwargs ): @@ -48,15 +50,15 @@ def __init__( self, server, groupid, topics=None, updatetopics=False, self.logger.propagate = False logout = logging.StreamHandler( sys.stderr ) self.logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - {loggername} - %(levelname)s] - %(message)s', + formatter = logging.Formatter( f'[%(asctime)s - {loggername_prefix}{loggername} - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logout.setFormatter( formatter ) # self.logger.setLevel( logging.INFO ) self.logger.setLevel( logging.DEBUG ) - self.countlogger = logging.getLogger( f"countlogger_{loggername}" ) + self.countlogger = logging.getLogger( f"countlogger_{loggername_prefix}{loggername}" ) self.countlogger.propagate = False - _countlogout = logging.FileHandler( _logdir / f"brokerpoll_counts_{loggername}.log" ) + _countlogout = logging.FileHandler( _logdir / f"brokerpoll_counts_{loggername_prefix}{loggername}.log" ) _countformatter = logging.Formatter( f'[%(asctime)s - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) _countlogout.setFormatter( _countformatter ) @@ -101,6 +103,11 @@ def __init__( self, server, groupid, topics=None, updatetopics=False, if ( self.mongodb_dbname is None ) != ( self.mongodb_collection is None ): raise ValueError( "Must give either both or neither of mongodb_name and mongodb_collection" ) + if ( ( self.postgres_brokermessage_model is None ) and + ( self.mongodb_dbname is None ) ): + raise ValueError( "Both postgres_brokermessage_model and mongodb_dbname are None; " + "nowhere to save consumed messages!" 
) + if self.postgres_brokermessage_model is not None: self.logger.info( f"Writing broker messages to postgres model " f"{self.postgres_brokermessage_model.__name__}" ) @@ -211,8 +218,10 @@ def handle_message_batch( self, msgs ): def mongodb_store(self, messagebatch=None): if messagebatch is None: return 0 - client = MongoClient( f"mongodb://{self.mongousername}:{self.mongopassword}@{self.mongohost}:27017/" - f"?authSource={self.mongodb_dbname}" ) + connstr = ( f"mongodb://{self.mongousername}:{self.mongopassword}@{self.mongohost}:27017/" + f"?authSource={self.mongodb_dbname}" ) + self.logger.debug( f"mongodb connection string {connstr}" ) + client = MongoClient( connstr ) db = getattr( client, self.mongodb_dbname ) collection = db[ self.mongodb_collection ] results = collection.insert_many(messagebatch) @@ -270,6 +279,8 @@ def poll( self, restart_time=datetime.timedelta(minutes=30) ): # I should replace this and the next one with a generic noauth consumer class BrahmsConsumer(BrokerConsumer): + _brokername = 'brahms' + def __init__( self, grouptag=None, brahms_topic=None, loggername="BRAHMS", **kwargs ): if brahms_topic is None: raise RuntimeError( "Must specify brahms topic" ) @@ -283,6 +294,8 @@ def __init__( self, grouptag=None, brahms_topic=None, loggername="BRAHMS", **kwa # This consumer is used in the tests class TestConsumer(BrokerConsumer): + _brokername = 'fakebroker' + def __init__( self, grouptag=None, test_topic=None, loggername="TEST", **kwargs ): if test_topic is None: raise RuntimeError( "Must specify test topic" ) @@ -296,6 +309,8 @@ def __init__( self, grouptag=None, test_topic=None, loggername="TEST", **kwargs # ====================================================================== class AntaresConsumer(BrokerConsumer): + _brokername = 'antares' + def __init__( self, grouptag=None, usernamefile='/secrets/antares_username', passwdfile='/secrets/antares_passwd', loggername="ANTARES", antares_topic='elasticc2-st1-ddf-full', **kwargs ): @@ -327,6 +342,8 @@ def __init__( self, grouptag=None, # ====================================================================== class FinkConsumer(BrokerConsumer): + _brokername = 'fink' + def __init__( self, grouptag=None, loggername="FINK", fink_topic='fink_elasticc-2022fall', **kwargs ): server = "134.158.74.95:24499" groupid = "elasticc-lbnl" + ( "" if grouptag is None else "-" + grouptag ) @@ -340,6 +357,8 @@ def __init__( self, grouptag=None, loggername="FINK", fink_topic='fink_elasticc- # ====================================================================== class AlerceConsumer(BrokerConsumer): + _brokername = 'alerce' + def __init__( self, grouptag=None, usernamefile='/secrets/alerce_username', @@ -386,6 +405,8 @@ def update_topics( self, *args, **kwargs ): # ===================================================================== # class PittGoogleBroker(BrokerConsumer): +# _brokername = 'pitt-google' +# # def __init__( # self, # pitt_topic: str, diff --git a/tom_desc/elasticc2/management/commands/brokerpoll2.py b/tom_desc/elasticc2/management/commands/brokerpoll2.py index ca4fb68a..b6994a0d 100644 --- a/tom_desc/elasticc2/management/commands/brokerpoll2.py +++ b/tom_desc/elasticc2/management/commands/brokerpoll2.py @@ -1,18 +1,11 @@ import sys import os -import io -import re import pathlib import time -import datetime import logging -import traceback import signal -import json import multiprocessing -import fastavro -import confluent_kafka -from concurrent.futures import ThreadPoolExecutor # for pittgoogle + import django.db from 
django.core.management.base import BaseCommand, CommandError import elasticc2.models @@ -95,11 +88,12 @@ def launch_broker( self, brokerclass, pipe, **options ): lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGUSR1" ) ) consumer = brokerclass( pipe=pipe, postgres_brokermessage_model=elasticc2.models.BrokerMessage, + loggername_prefix='elasticc2_', **options ) consumer.poll() def handle( self, *args, **options ): - self.logger.info( "******** brokerpoll starting ***********" ) + self.logger.info( "******** elasticc2 brokerpoll starting ***********" ) self.mustdie = False signal.signal( signal.SIGTERM, lambda sig, stack: self.sigterm( "TERM" ) ) @@ -122,7 +116,7 @@ def handle( self, *args, **options ): if len( brokerstodo ) == 0: self.logger.error( "Must give at least one broker to listen to." ) raise RuntimeError( "No brokers given to listen to." ) - + # Launch a process for each broker that will poll that broker indefinitely # We want to make sure that django doesn't send copies of database sessions @@ -132,7 +126,7 @@ def handle( self, *args, **options ): # (They already open mongo connections as necessary, and django doesn't muck # about with mongo, so we don't have to do things for that.) django.db.connections.close_all() - + brokers = {} for name,brokerclass in brokerstodo.items(): self.logger.info( f"Launching thread for {name}" ) diff --git a/tom_desc/fastdb_dev/management/commands/fastdb_dev_brokerpoll.py b/tom_desc/fastdb_dev/management/commands/fastdb_dev_brokerpoll.py index e683929a..729583e5 100644 --- a/tom_desc/fastdb_dev/management/commands/fastdb_dev_brokerpoll.py +++ b/tom_desc/fastdb_dev/management/commands/fastdb_dev_brokerpoll.py @@ -1,43 +1,81 @@ -from pymongo import MongoClient import sys +import os import pathlib +import time import logging -import fastavro -import json -import multiprocessing import signal -import time -import confluent_kafka -import io -import os -import re -import traceback -import datetime -import collections -import atexit -import argparse -import urllib +import multiprocessing + +import django.db +from django.core.management.base import BaseCommand, CommandError +import elasticc2.models + +_rundir = pathlib.Path(__file__).parent +_djangodir = _rundir.parent.parent.parent +_logdir = pathlib.Path( os.getenv( 'LOGDIR', '/logs' ) ) + +# Add the db/management/commands directory as we include stuff from there +sys.path.append( str(_rundir.parent.parent.parent / "db/management/commands" ) ) +from _brokerconsumer import ( BrahmsConsumer, + TestConsumer, + AntaresConsumer, + FinkConsumer, + AlerceConsumer, + # PittGoogleBroker + ) +# ===================================================================== +# To make this die cleanly, send the USR1 signal to it +# (SIGTERM doesn't work because django captures that, sadly.) +class Command(BaseCommand): + help = 'Poll ELAsTiCC Brokers, load fastdb' + schemafile = _djangodir / "elasticc2/management/commands/elasticc.v0_9.brokerClassification.avsc" -class Broker(object): + def __init__( self, *args, **kwargs ): + super().__init__( *args, **kwargs ) - def __init__( self, username=None, password=None, *args, **options ): + # Make sure the log directory exists - self.logger = logging.getLogger( "brokerpoll_baselogger" ) + if _logdir.exists(): + if not _logdir.is_dir(): + raise RuntimeError( "{_logdir} exists but is not a directory!" 
) + else: + _logdir.mkdir( parents=True ) + + self.logger = logging.getLogger( "fastdb_dev_brokerpoll_baselogger" ) self.logger.propagate = False - logout = logging.FileHandler( _logdir / f"logs/brokerpoll.log" ) + logout = logging.FileHandler( _logdir / "fastdb_dev_brokerpoll.log" ) self.logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - brokerpoll - %(levelname)s] - %(message)s', + formatter = logging.Formatter( f'[%(asctime)s - elasticc2 - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logout.setFormatter( formatter ) - self.logger.setLevel( logging.DEBUG ) - if options['reset']: - self.reset = options['reset'] - - self.username = username - self.password = password - + self.logger.setLevel( logging.INFO ) + + def add_arguments( self, parser ): + parser.add_argument( '--do-alerce', action='store_true', default=False, help="Poll from ALeRCE" ) + parser.add_argument( '--alerce-topic-pattern', default='^lc_classifier_.*_(\d{4}\d{2}\d{2})$', + help='Regex for matching ALeRCE topics (warning: custom code, see AlerceBroker)' ) + parser.add_argument( '--do-antares', action='store_true', default=False, help="Poll from ANTARES" ) + parser.add_argument( '--antares-topic', default=None, help='Topic name for Antares' ) + parser.add_argument( '--do-fink', action='store_true', default=False, help="Poll from Fink" ) + parser.add_argument( '--fink-topic', default=None, help='Topic name for Fink' ) + parser.add_argument( '--do-brahms', action='store_true', default=False, + help="Poll from Rob's test kafka server" ) + parser.add_argument( '--brahms-topic', default=None, + help="Topic to poll on brahms (required if --do-brahms is True)" ) + parser.add_argument( '--do-pitt', action='store_true', default=False, help="Poll from PITT-Google" ) + parser.add_argument( '--pitt-topic', default=None, help="Topic name for PITT-Google" ) + parser.add_argument( '--pitt-project', default=None, help="Project name for PITT-Google" ) + parser.add_argument( '--do-test', action='store_true', default=False, + help="Poll from kafka-server:9092 (for testing purposes)" ) + parser.add_argument( '---test-topic', default='classifications', + help="Topic to poll from on kafka-server:9092" ) + parser.add_argument( '-g', '--grouptag', default=None, help="Tag to add to end of kafka group ids" ) + parser.add_argument( '-r', '--reset', default=False, action='store_true', + help='Reset all stream pointers' ) + parser.add_argument( '-m', '--mongodb-dbname', default='alerts', + help="Name of the database on $MONGOHOST to write alerts to (default: alerts)" ) def sigterm( self, sig="TERM" ): self.logger.warning( f"Got a {sig} signal, trying to die." 
) @@ -48,16 +86,21 @@ def launch_broker( self, brokerclass, pipe, **options ): lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGINT" ) ) signal.signal( signal.SIGTERM, lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGTERM" ) ) - consumer = brokerclass( pipe=pipe, **options ) + signal.signal( signal.SIGUSR1, + lambda sig, stack: self.logger.warning( f"{brokerclass.__name__} ignoring SIGUSR1" ) ) + consumer = brokerclass( pipe=pipe, + loggername_prefix='fastdb_dev_', + mongodb_collection=brokerclass._brokername, + **options ) consumer.poll() - def broker_poll( self, *args, **options): - self.logger.info( "******** brokerpoll starting ***********" ) + def handle( self, *args, **options ): + self.logger.info( "******** fastdb_dev brokerpoll starting ***********" ) self.mustdie = False signal.signal( signal.SIGTERM, lambda sig, stack: self.sigterm( "TERM" ) ) signal.signal( signal.SIGINT, lambda sig, stack: self.sigterm( "INT" ) ) - + signal.signal( signal.SIGUSR1, lambda sig, stack: self.sigterm( "USR1" ) ) brokerstodo = {} if options['do_alerce']: @@ -73,16 +116,24 @@ def broker_poll( self, *args, **options): if options['do_test']: brokerstodo['test'] = TestConsumer if len( brokerstodo ) == 0: - print( "Must give at least one broker to listen to." ) - - brokers = {} + self.logger.error( "Must give at least one broker to listen to." ) + raise RuntimeError( "No brokers given to listen to." ) # Launch a process for each broker that will poll that broker indefinitely + # We want to make sure that django doesn't send copies of database sessions + # to the subprocesses; at least for Cassandra, that breaks things. So, + # before launching all the processes, close all the database django connections + # so that each process will open a new one as it needs it. + # (They already open mongo connections as necessary, and django doesn't muck + # about with mongo, so we don't have to do things for that.) + django.db.connections.close_all() + + brokers = {} for name,brokerclass in brokerstodo.items(): self.logger.info( f"Launching thread for {name}" ) parentconn, childconn = multiprocessing.Pipe() - proc = multiprocessing.Process( target=self.launch_broker(brokerclass, childconn, **options) ) + proc = multiprocessing.Process( target=lambda: self.launch_broker(brokerclass, childconn, **options) ) proc.start() brokers[name] = { "process": proc, "pipe": parentconn, @@ -118,7 +169,8 @@ def broker_poll( self, *args, **options): for name, broker in brokers.items(): dt = time.monotonic() - broker['lastheartbeat'] if dt > toolongsilent: - self.logger.error( f"It's been {dt:.0f} seconds since last heartbeat from {name}; "f"will restart." ) + self.logger.error( f"It's been {dt:.0f} seconds since last heartbeat from {name}; " + f"will restart." ) brokerstorestart.add( name ) for torestart in brokerstorestart: @@ -128,7 +180,7 @@ def broker_poll( self, *args, **options): del brokers[torestart] parentconn, childconn = multiprocessing.Pipe() proc = multiprocessing.Process( target=lambda: self.launch_broker( brokerstodo[torestart], - childconn, **options ) ) + childconn, **options ) ) proc.start() brokers[torestart] = { "process": proc, "pipe": parentconn, @@ -138,50 +190,10 @@ def broker_poll( self, *args, **options): self.mustdie = True # I chose 20s since kubernetes sends a TERM and then waits 30s before shutting things down + # (Note that the Pitt-Google consumer doesn't handle this message.) self.logger.warning( "Shutting down. 
Sending die to all processes and waiting 20s" ) for name, broker in brokers.items(): broker['pipe'].send( { "command": "die" } ) time.sleep( 20 ) self.logger.warning( "Exiting." ) return - - -if __name__ == '__main__': - - logger = logging.getLogger( "brokerpoll_baselogger" ) - logger.propagate = False - logout = logging.FileHandler( _logdir / f"logs/brokerpoll.log" ) - logger.addHandler( logout ) - formatter = logging.Formatter( f'[%(asctime)s - brokerpoll - %(levelname)s] - %(message)s',datefmt='%Y-%m-%d %H:%M:%S' ) - logout.setFormatter( formatter ) - logger.setLevel( logging.DEBUG ) - - - parser = argparse.ArgumentParser() - - parser.add_argument( '--do-alerce', action='store_true', default=False, help="Poll from ALeRCE" ) - parser.add_argument( '--alerce-topic-pattern', default='^lc_classifier_.*_(\d{4}\d{2}\d{2})$', - help='Regex for matching ALeRCE topics (warning: custom code, see AlerceBroker)' ) - parser.add_argument( '--do-antares', action='store_true', default=False, help="Poll from ANTARES" ) - parser.add_argument( '--antares-topic', default=None, help='Topic name for Antares' ) - parser.add_argument( '--do-fink', action='store_true', default=False, help="Poll from Fink" ) - parser.add_argument( '--fink-topic', default=None, help='Topic name for Fink' ) - parser.add_argument( '--do-brahms', action='store_true', default=False, - help="Poll from Rob's test kafka server" ) - parser.add_argument( '--brahms-topic', default=None, - help="Topic to poll on brahms (required if --do-brahms is True)" ) - parser.add_argument( '--do-pitt', action='store_true', default=False, help="Poll from PITT-Google" ) - parser.add_argument( '--pitt-topic', default=None, help="Topic name for PITT-Google" ) - parser.add_argument( '--pitt-project', default=None, help="Project name for PITT-Google" ) - parser.add_argument( '--do-test', action='store_true', default=False, - help="Poll from kafka-server:9092 (for testing purposes)" ) - parser.add_argument( '--test-topic', default='classifications', - help="Topic to poll from on kafka-server:9092" ) - parser.add_argument( '-g', '--grouptag', default=None, help="Tag to add to end of kafka group ids" ) - parser.add_argument('-r', '--reset', action='store_true', default=False, help='Reset all stream pointers') - - options = vars(parser.parse_args()) - - broker = Broker(**options) - - poll = broker.broker_poll(**options) diff --git a/tom_desc/fastdb_dev/management/commands/load_fastdb.py b/tom_desc/fastdb_dev/management/commands/load_fastdb.py index 85b4e7c1..11ce529f 100644 --- a/tom_desc/fastdb_dev/management/commands/load_fastdb.py +++ b/tom_desc/fastdb_dev/management/commands/load_fastdb.py @@ -8,6 +8,7 @@ from django.core.management.base import BaseCommand, CommandError import signal import datetime +import pytz from datetime import timezone import time from psycopg2.extras import execute_values @@ -19,27 +20,26 @@ import pprint import urllib.parse import os -from fastdb_dev.models import LastUpdateTime, ProcessingVersions, HostGalaxy, Snapshots, DiaObject, DiaSource, DiaForcedSource +from fastdb_dev.models import LastUpdateTime, ProcessingVersions, HostGalaxy, Snapshots +from fastdb_dev.models import DiaObject, DiaSource, DiaForcedSource from fastdb_dev.models import DStoPVtoSS, DFStoPVtoSS, BrokerClassifier, BrokerClassification from django.core.exceptions import ObjectDoesNotExist -_rundir = pathlib.Path(__file__).parent -print(_rundir) -sys.path.insert(0, str(_rundir) ) - +_logdir = pathlib.Path( os.getenv( 'LOGDIR', '/logs' ) ) class 
Command(BaseCommand): help = 'Store alerts in FASTDB' def __init__( self, *args, **kwargs ): super().__init__( *args, **kwargs ) - self.logger = logging.getLogger( "FASTDB log" ) + self.logger = logging.getLogger( "load_fastdb" ) self.logger.propagate = False - logout = logging.FileHandler( _rundir.parent.parent.parent.parent / f"code/logs/fastdb.log" ) + logout = logging.FileHandler( _logdir / f"load_fastdb.log" ) self.logger.addHandler( logout ) formatter = logging.Formatter( f'[%(asctime)s - fastdb - %(levelname)s] - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logout.setFormatter( formatter ) + # self.logger.setLevel( logging.INFO ) self.logger.setLevel( logging.DEBUG ) def add_arguments( self, parser ): @@ -51,9 +51,16 @@ def add_arguments( self, parser ): def handle( self, *args, **options ): - mongodb_collections = {'alerce':'alerce','antares':'antares','fink':'fink','ztf':'ztf', 'test':'test'} - brokerNames = {'test':'FakeBroker'} - self.logger.info( "********fastdb starting ***********" ) + mongodb_collections = { 'alerce': 'alerce', + 'antares': 'antares', + 'fink': 'fink', + 'ztf': 'ztf', + 'test': 'test', + 'fakebroker': 'fakebroker' } + brokerNames = { 'test': 'FakeBroker', + 'fakebroker': 'FakeBroker' } + + self.logger.info( "********load_fastdb starting ***********" ) season = options['season'] snapshot = options['snapshot'] @@ -68,9 +75,9 @@ def handle( self, *args, **options ): mongohost = os.getenv( 'MONGOHOST', 'fastdbdev-mongodb' ) client = MongoClient(f"mongodb://{mongo_username}:{mongo_password}@{mongohost}:27017/?authSource=alerts") self.db = client.alerts - + # Connect to the PPDB - + # Get password ppdb_name = os.environ['DB_NAME'] @@ -79,18 +86,26 @@ def handle( self, *args, **options ): ppdb_password = os.environ['DB_PASS'] conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (ppdb_host,ppdb_name,ppdb_user,ppdb_password) conn = psycopg2.connect(conn_string) - + cursor = conn.cursor() self.logger.info("Connected to PPDB") # Get last update time - lst = LastUpdateTime.objects.latest('last_update_time') - last_update_time = lst.last_update_time - self.logger.info(last_update_time) - print(last_update_time) - + try: + lst = LastUpdateTime.objects.latest('last_update_time') + last_update_time = lst.last_update_time + except Exception as ex: + # Probably just means there's no last update time in the database yet, + # So set last_update_time to the beginning of time. (Well, the Unix epoch. Same thing.) + last_update_time = datetime.datetime.fromtimestamp( 0, tz=datetime.timezone.utc ) + lst = LastUpdateTime( last_update_time=last_update_time ) + lst.save() + + self.logger.info( f"Last update time: {last_update_time}" ) + # print(last_update_time) + current_datetime = datetime.datetime.now(tz=datetime.timezone.utc) #current_datetime = datetime.datetime(2023,4,30,0,0,0,tzinfo=timezone.utc) @@ -112,21 +127,46 @@ def handle( self, *args, **options ): else: self.logger.error("Current date isn't gte than start date of Processing Version") exit - + # get all the alerts that pass at least one of the SN criteria with probability > 0.1 since last_update_time + # (TODO: make this 0.1 a passable option?) + # (TODO: also need to make the list of types we care about a passable option.) 
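Both of those TODOs could be handled with ordinary command-line options. The sketch below uses plain argparse with invented option names (--min-probability, --sn-classids); it only illustrates the idea and is not part of the management command.

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument( '--min-probability', type=float, default=0.1,
                         help="Only ingest classifications with probability >= this" )
    parser.add_argument( '--sn-classids', type=int, nargs='+',
                         default=[ 2222, 2223, 2224, 2225, 2226, 2242, 2232 ],
                         help="classIds treated as passing the SN criteria" )
    options = vars( parser.parse_args( [] ) )    # e.g. parse_args( ['--min-probability', '0.3'] )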
         # Loop over the brokers that were passed in via the argument list
 
         brokerstodo = options['brokers']
         print(brokerstodo)
 
         list_diaSourceId = []
-
+
         for name in brokerstodo:
-            print(brokerNames[name])
+            self.logger.debug( f"Doing broker {name} {f'({brokerNames[name]})' if name in brokerNames else ''}" )
             collection = self.db[mongodb_collections[name]]
-            results = collection.find({"$and":[{"msg.brokerName":brokerNames[name]},{"timestamp":{'$gte':last_update_time, '$lt':current_datetime}},{"msg.classifications":{'$elemMatch':{'$and':[{"classId":{'$in':[2222,2223,2224,2225,2226,2242,2232]}},{"probability":{'$gte':0.1}}]}}}]})
-
+
+            # Is the 'timestamp' set by when the document is added to the database, or is it set by
+            # the broker when it sent the message?  If the latter, then there could be trouble here:
+            # it's possible that we'll ingest messages *after* this script runs that are timestamped
+            # _before_ current_datetime, and then those messages will never get processed.  (If 'timestamp'
+            # is when it's loaded into the database, then we should be safer.)
+            results = collection.find( {"$and": [ {"msg.brokerName": brokerNames[name]},
+                                                  {"timestamp": {'$gte':last_update_time, '$lt':current_datetime}},
+                                                  {"msg.classifications": {
+                                                      '$elemMatch':{
+                                                          '$and':[ {"classId": {'$in':[2222,2223,2224,2225,
+                                                                                       2226,2242,2232]}},
+                                                                   {"probability": {'$gte':0.1}} ]
+                                                      }
+                                                  }
+                                                  }
+                                                ] } )
+
+            # TODO: we're probably going to want to have a bulk load here.
+            # (For elasticc2, I needed to bulk load broker messages to avoid
+            # getting totally killed by overhead.)
+            #
+            # TODO : cache known broker ids so we don't have to
+            # read that table for every row of results.
+            # (Does django do internal caching?)
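Editor's note: the two TODOs above (bulk loading and caching the classifier lookup) come down to the same pattern: memoize the lookup on its natural key, and buffer rows so they can be written in one round trip (Django's bulk_create would be the ORM-side tool for the latter). The sketch below shows that pattern with plain Python stand-ins; classifier_id_for, queue_classification, flush() and the batch size are invented for illustration and are not part of load_fastdb.py.

# Editor's sketch of the cache-and-batch pattern the TODOs describe (stand-in code, no ORM).
classifier_cache = {}          # (broker_name, broker_version, classifier_name, classifier_params) -> id
pending_classifications = []   # rows buffered for a single bulk write
BATCH_SIZE = 1000

def classifier_id_for( key, lookup ):
    """Memoize the per-row classifier lookup so the table is read at most once per classifier."""
    if key not in classifier_cache:
        classifier_cache[key] = lookup( key )    # e.g. a get_or_create against BrokerClassifier
    return classifier_cache[key]

def queue_classification( row, flush ):
    """Buffer rows and hand them to flush() in batches (e.g. BrokerClassification.objects.bulk_create)."""
    pending_classifications.append( row )
    if len( pending_classifications ) >= BATCH_SIZE:
        flush( pending_classifications )
        pending_classifications.clear()

if __name__ == '__main__':
    fake_lookup = lambda key: hash( key ) % 1000
    fake_flush = lambda rows: print( f"flushing {len(rows)} rows" )
    for i in range( 2500 ):
        cid = classifier_id_for( ( "FakeBroker", "1.0", "clf", "{}" ), fake_lookup )
        queue_classification( { "classifier": cid, "source": i }, fake_flush )
    fake_flush( pending_classifications )   # final partial batch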
+
             for r in results:
                 diaSource_id = r['msg']['diaSourceId']
                 alert_id = r['msg']['alertId']
@@ -135,18 +175,41 @@ def handle( self, *args, **options ):
                 bc.dia_source = r['msg']['diaSourceId']
                 bc.topic_name = r['topic']
                 bc.desc_ingest_timestamp = datetime.datetime.now(tz=datetime.timezone.utc)
-                bc.broker_ingest_timestamp = r['timestamp']
+                timestamp = r['timestamp']
+                if not isinstance( timestamp, datetime.datetime ):
+                    raise TypeError( f"r['timestamp'] is a {type(timestamp)}, expected datetime.datetime" )
+                if timestamp.tzinfo is None:
+                    timestamp = pytz.utc.localize( timestamp )
+                bc.broker_ingest_timestamp = timestamp
+
                 broker_version = r['msg']['brokerVersion']
                 broker_name = r['msg']['brokerName']
                 classifier_name = r['msg']['classifierName']
-                broker_classifier = BrokerClassifier.objects.get(broker_name=broker_name, broker_version=broker_version, classifier_name=classifier_name)
+                classifier_params = r['msg']['classifierParams']
+                broker_classifier = BrokerClassifier.objects.filter( broker_name=broker_name,
+                                                                     broker_version=broker_version,
+                                                                     classifier_name=classifier_name,
+                                                                     classifier_params=classifier_params )
+                if broker_classifier.count() > 1:
+                    raise ValueError( "Database corruption error! Broker classifier shows up more than once!" )
+                if broker_classifier.count() == 0:
+                    # Broker classifier doesn't exist yet, create it
+                    broker_classifier = BrokerClassifier( broker_name=broker_name,
+                                                          broker_version=broker_version,
+                                                          classifier_name=classifier_name,
+                                                          classifier_params=classifier_params )
+                    broker_classifier.save()
+                else:
+                    broker_classifier = broker_classifier.first()
+
                 bc.classifier = broker_classifier.classifier_id  # Local copy of classifier to circumvent Django Foreign key rules
                 bc.classifications = r['msg']['classifications']
                 bc.save()
-
+
                 list_diaSourceId.append(diaSource_id)
+
         # Get unique set of source Ids across all broker alerts
         uniqueSourceId = set(list_diaSourceId)
@@ -156,19 +219,36 @@ def handle( self, *args, **options ):
 
         #columns = diaSourceId,diaObjectId,psFlux,psFluxSigma,midPointTai,ra,decl,snr,filterName,observeDate
 
+        # TODO: another place we may well want a bulk loader.  More complicated here
+        # because right now, the code as is does both adding and updating, whereas
+        # with bulk stuff that's probably thornier.
+        # (Hopefully for the real PPDB it will be possible to send a list of source ids and
+        # get all the information at once.)
         for d in uniqueSourceId:
-            self.logger.info("Source Id %d" % d)
-            query = sql.SQL( "SELECT * FROM {} where {} = %s").format(sql.Identifier('elasticc2_ppdbdiasource'),sql.Identifier('diasource_id'))
-            self.logger.info(query.as_string(conn))
-
+            # self.logger.debug("Source Id %d" % d)
+            query = ( sql.SQL( "SELECT * FROM {} where {} = %s")
+                      .format(sql.Identifier('elasticc2_ppdbdiasource'),sql.Identifier('diasource_id')) )
+            # self.logger.debug(query.as_string(conn))
+
            cursor.execute(query,(d,))
-            self.logger.info("Count = %d" % cursor.rowcount)
-            if cursor.rowcount != 0:
+            if cursor.rowcount == 0:
+                self.logger.error( f"source {d} not known in PPDB!" )
+            else:
+                if cursor.rowcount > 1:
+                    self.logger.error( f"source {d} multiply defined in PPDB! This should be impossible." )
+
                 result = cursor.fetchone()
 
                 # Store this new Source in the FASTDB
+                # THOUGHT REQUIRED : the source could well already
+                # exist.  Brokers can (and will) send classifications
+                # for sources that were already classified by other
+                # brokers in a previous run.  Does this code handle
+                # updating as well as inserting?  If not, we have to do
+                # that explicitly just as with objects below.
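Editor's note: the THOUGHT REQUIRED comment above asks whether re-ingesting an already-known source is handled. One common answer is an upsert: insert the source, and on a key conflict update only the fields that can legitimately change (broker_count here). The sketch below demonstrates the idea with an in-memory SQLite table because that runs anywhere; in the actual command the equivalents would be Django's update_or_create or PostgreSQL's INSERT ... ON CONFLICT, and the toy table and columns are invented for illustration.

# Editor's sketch: upsert semantics for a re-ingested source (toy schema, SQLite >= 3.24).
import sqlite3

con = sqlite3.connect( ":memory:" )
con.execute( "CREATE TABLE dia_source ( dia_source INTEGER PRIMARY KEY, broker_count INTEGER )" )

def upsert_source( source_id, broker_count ):
    # Insert a new source, or bump broker_count if a previous run already loaded it.
    con.execute(
        "INSERT INTO dia_source ( dia_source, broker_count ) VALUES ( ?, ? ) "
        "ON CONFLICT( dia_source ) DO UPDATE SET broker_count = broker_count + excluded.broker_count",
        ( source_id, broker_count ) )

upsert_source( 42, 1 )   # first run: row is inserted
upsert_source( 42, 2 )   # later run: same source classified again, count is updated
print( con.execute( "SELECT * FROM dia_source" ).fetchall() )   # [(42, 3)]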
+
                 ds = DiaSource(dia_source=result[0])
                 ds.season = season
                 ds.filter_name = result[2]
@@ -178,32 +258,34 @@ def handle( self, *args, **options ):
                 ds.ps_flux_err = result[6]
                 ds.snr = result[7]
                 ds.mid_point_tai = result[1]
-
+
                 # Count how many brokers alerted on this Source Id
                 ds.broker_count = list_diaSourceId.count(d)
+
+                # TODO: shouldn't this only be set if the source doesn't already exist?
                 ds.insert_time = datetime.datetime.now(tz=datetime.timezone.utc)
 
                 diaObjectId = result[8]
-                self.logger.info("Dia Object Id = %s" % diaObjectId)
-
+
                 # Now look to see whether we already have this DiaObject in FASTDB
-
+
                 try:
                     do = DiaObject.objects.get(pk=diaObjectId)
-
+
                     # Update number of observations
-
+
                     do.nobs +=1
                     do.save()
-
+
                 except ObjectDoesNotExist:
-
-                    self.logger.info("DiaObject not in FASTDB. Create new entry.")
-
+
+                    self.logger.info( f"DiaObject {diaObjectId} not in FASTDB.  Create new entry.")
+
                     # Fetch the DiaObject from the PPDB
-
-                    query = sql.SQL("SELECT * from {} where {} = %s").format(sql.Identifier('elasticc2_ppdbdiaobject'),sql.Identifier('diaobject_id'))
-
+
+                    query = ( sql.SQL("SELECT * from {} where {} = %s")
+                              .format(sql.Identifier('elasticc2_ppdbdiaobject'),sql.Identifier('diaobject_id')) )
+
                     cursor.execute(query,(diaObjectId,))
                     if cursor.rowcount != 0:
                         result = cursor.fetchone()
@@ -217,12 +299,12 @@ def handle( self, *args, **options ):
                         do.ra_dec_tai = ds.mid_point_tai
                         do.nobs = 1
                         do.insert_time = datetime.datetime.now(tz=datetime.timezone.utc)
-
+
                         # locate Host Galaxies in Data release DB Object table
                         # There is information in the PPDB for the 3 closest objects. Is this good enough?
                        # Where to get them in season 1?
-
-
+
+
                         do.save()
@@ -242,20 +324,34 @@ def handle( self, *args, **options ):
                 dspvss.insert_time = datetime.datetime.now(tz=datetime.timezone.utc)
 
                 dspvss.save()
-
+
                 # Look to see if there any ForcedSource entries for this object
                 # Now look to see whether we already have any ForcedSource in FASTDB
-
+
                 dfs = DiaForcedSource.objects.filter(dia_object_id=diaObjectId)
-                self.logger.info(len(dfs))
-                if len(dfs) == 0:
+                if len(dfs) == 0:
                     # diaForcedSourceId,diaObjectId,psFlux,psFluxSigma,filterName,observeDate
-                    query = sql.SQL("SELECT * from {} where {} = %s").format(sql.Identifier('elasticc2_ppdbdiaforcedsource'),sql.Identifier('diaobject_id'))
+                    # TODO NOTE : at least for elasticc2, where the "ppdb" is simulated, we may
+                    # want to put in a check that the midpointtai of the forced source is not newer
+                    # than the midpointtai of the source in question.  Right now, it imports the
+                    # future, as the elasticc2 ppdb tables have all sources and forced sources
+                    # that either have been or will be detected.
+                    #
+                    # (elasticc2/management/commands/update_elasticc2_sources.py does this)
+                    #
+                    # This is also relevant for real LSST.  When we get more sources on a pre-existing
+                    # object, we want to get any new forced source photometry for that object.
+                    # If we only look for forced sources if the object doesn't already have any,
+                    # then we will only get forced sources from times before the first source
+                    # we load.
+                    query = ( sql.SQL("SELECT * from {} where {} = %s")
+                              .format(sql.Identifier('elasticc2_ppdbdiaforcedsource'),sql.Identifier('diaobject_id')) )
                     cursor.execute(query,(diaObjectId,))
                     if cursor.rowcount != 0:
                         results = cursor.fetchall()
+                        self.logger.debug( f"Loading {len(results)} forced sources for object {diaObjectId}" )
                         for r in results:
                             dfs = DiaForcedSource(dia_forced_source=r[0])
                             dfs.dia_force_source = r[0]
@@ -268,14 +364,13 @@ def handle( self, *args, **options ):
                             dfs.mid_point_tai = r[1]
                             dfs.insert_time = datetime.datetime.now(tz=datetime.timezone.utc)
                             dfs.processing_version = processing_version
-                            self.logger.info("Forced Source Id %d" % r[0])
                             dfs.save()
 
                             dfspvss = DFStoPVtoSS(dia_forced_source=r[0])
                             dfspvss.processing_version = processing_version
                             dfspvss.snapshot_name = snapshot
                             dfspvss.insert_time = datetime.datetime.now(tz=datetime.timezone.utc)
-
+
                             dfspvss.save()
@@ -283,7 +378,7 @@ def handle( self, *args, **options ):
         # Store last_update_time
         lst.last_update_time = current_datetime
         lst.save()
-
+
         cursor.close()
         conn.close()
-
+
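Editor's note: the TODO NOTE in the forced-source block above suggests bounding the forced-source query by the triggering source's midpointtai so that the simulated elasticc2 PPDB does not hand back photometry from the future. A sketch of such a query, built with the same psycopg2.sql composition style the command already uses, is below; the column name midpointtai and the cutoff parameter are assumptions, and the snippet only composes the statement (executing it would need the PPDB connection).

# Editor's sketch: the forced-source query with an assumed midpointtai cutoff (requires psycopg2).
from psycopg2 import sql

query = ( sql.SQL( "SELECT * FROM {} WHERE {} = %s AND {} <= %s" )
          .format( sql.Identifier( 'elasticc2_ppdbdiaforcedsource' ),
                   sql.Identifier( 'diaobject_id' ),
                   sql.Identifier( 'midpointtai' ) ) )    # column name assumed

# With a live connection this would run as, e.g.:
#   cursor.execute( query, ( diaObjectId, ds.mid_point_tai ) )
# so only forced sources observed at or before the triggering source are loaded.
print( repr( query ) )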