File invalidation tool #859

Open · wants to merge 8 commits into master
1 change: 1 addition & 0 deletions DMOps/file_invalidation_tool/.gitignore
@@ -0,0 +1 @@
secrets/*
34 changes: 34 additions & 0 deletions DMOps/file_invalidation_tool/Dockerfile
@@ -0,0 +1,34 @@
FROM registry.cern.ch/cmsmonitoring/cmsmon-spark:v0.5.0.1

# Set environment variables
ENV PYCURL_SSL_LIBRARY=nss \
    X509_USER_CERT=/certs/usercert.pem \
    X509_USER_KEY=/certs/userkey.pem \
    RUCIO_CONFIG=/cvmfs/cms.cern.ch/rucio/rucio.cfg \
    RUCIO_ACCOUNT=transfer_ops \
    DRIVERPORT=5001 \
    BMPORT=5002 \
    UIPORT=5003

# Install dependencies for Rucio, DBS and Gfal

RUN dnf install -y libcurl-devel openssl-devel libffi-devel ca-policy-egi-core \
    && dnf install -y gfal2-all python3-gfal2 python3-gfal2-util \
    && dnf install -y cmake gfal2-devel libcurl-devel \
    && dnf -y groupinstall "Development Tools" || true \
    && pip3 install cx-Oracle SQLAlchemy==1.4.49 dbs3-client rucio-clients \
    && pip3 install --compile --global-option="--with-nss" --no-cache-dir pycurl

# Expose ports
EXPOSE 5001
EXPOSE 5002
EXPOSE 5003

# Copy code
COPY ./src /src/

# Set working directory
WORKDIR /src

# Set entrypoint
ENTRYPOINT ["python3", "run_invalidations.py"]
16 changes: 16 additions & 0 deletions DMOps/file_invalidation_tool/input_examples/checksum_val.csv
@@ -0,0 +1,16 @@
FILENAME,RSE_EXPRESSION
/store/mc/RunIISummer20UL18MiniAODv2/GluGluToBulkGravitonToHHTo2B2Tau_M-400_TuneCP5_PSWeights_narrow_13TeV-madgraph-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/2520000/06DDDBA7-B312-2E46-A22A-C76AF34D192C.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ZH_HToBB_ZToQQ_M-125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/2430000/EC177528-23FC-FB4E-9010-473CA26570AC.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ZHToTauTau_M125_CP5_13TeV-powheg-pythia8_ext1/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/80000/EF503A5C-98E7-434C-9DD9-75CE61BDEFAB.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/HWplusJ_HToWW_M-125_TuneCP5_13TeV-powheg-jhugen727-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/70000/591EF774-ADD7-B741-9074-93F25C3667BC.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/HWplusJ_HToWW_M-125_TuneCP5_13TeV-powheg-jhugen727-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/70000/51474CA2-DAF8-0C43-B360-58D0F14F4A1D.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/QCD_HT300to500_TuneCP5_PSWeights_13TeV-madgraph-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/30000/979ACFAB-ED97-3A41-A809-EBF77A5146DA.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/QCD_Pt-470To600_MuEnrichedPt5_TuneCP5_13TeV-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/260000/91B1AB52-B822-8541-B016-5232BF621F98.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/890C5D33-E0F6-094C-8510-8DDC40D2B742.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/1694A7CA-F3DA-8140-B01D-98B828E5EB20.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/89F8B638-7285-AC44-9DFF-B78C3177E6EE.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/BF5A2984-E146-8843-B60F-E5C29888FBBE.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/6330270D-F058-F840-AF9A-F69530956178.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/7729B11E-A1FE-E845-8741-36CAF547AC02.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/4DAFAEEE-245F-CA42-A6B0-EFAF2FDD464B.root,rse_type=DISK
/store/mc/RunIISummer20UL18MiniAODv2/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/MINIAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/250000/D61AE12C-A68A-754A-B90D-44688705DD4A.root,rse_type=DISK
1 change: 1 addition & 0 deletions DMOps/file_invalidation_tool/input_examples/dids.txt
@@ -0,0 +1 @@
/DMsimp_t-S3D_uR_JChiChi_Mphi-2000_Mchi-350_Lambda-1p0_TuneCUETP8M1_13TeV-madgraph_pythia8/RunIIFall17NanoAODv7-PU2017_12Apr2018_Nano02Apr2020_102X_mc2017_realistic_v8-v1/NANOAODSIM#1ecf1b2a-df7e-4c8d-82e1-2bc0b49d5271
120 changes: 120 additions & 0 deletions DMOps/file_invalidation_tool/readme.md
@@ -0,0 +1,120 @@
# File Invalidation Tool for Rucio and DBS
## Overview

This guide outlines the steps to run the file invalidation tool for Rucio and DBS using a Docker image. The tool invalidates specific files, datasets, or containers within these systems to keep data consistent. It also offers a running mode that checks the integrity of files at a given RSE (checksum validation) and invalidates the corrupted replicas. Finally, the tool can be used to invalidate all files at a given site.

## Prerequisites, Folder Structure and Tool Input

### Tool Input

The tool has 5 running modes. Your certificate and (decrypted) key must have sufficient permissions to invalidate files on DBS and to declare replicas as bad on Rucio. In addition, each mode requires the inputs and parameters listed below; an example input file follows the table:

| Running Mode | Description | Tool Mode | Input File | Params | Auth Requirements |
| ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
| Global Invalidation | Invalidate all files from the received list of files, datasets, or containers on both Rucio and DBS | `global` | `<filename>.txt`: text file containing a list of files, datasets or containers | `--user <kerberos_username>`: Kerberos user for Spark jobs<br>`--reason <reason>`: comment for the invalidation<br>`--dry-run` (**optional**): simulate the execution without actually performing the file invalidation<br>`--erase-mode` (**optional**): erase empty DIDs | `./certs/usercert.pem`<br>`./certs/userkey.pem`<br>`./secrets/<user>.keytab`|
| DBS Invalidation | Invalidate all files from the received list of files, datasets, or containers on DBS only | `only-dbs` | `<filename>.txt`: text file containing a list of files, datasets or containers |`--user <kerberos_username>`: Kerberos user for Spark jobs<br>`--reason <reason>`: comment for the invalidation<br>`--dry-run` (**optional**): simulate the execution without actually performing the file invalidation<br>`--erase-mode` (**optional**): erase empty DIDs | `./certs/usercert.pem`<br>`./certs/userkey.pem`<br>`./secrets/<user>.keytab`|
| Rucio Invalidation | Invalidate all files from the received list of files, datasets, or containers on Rucio only | `only-rucio` | `<filename>.txt`: text file containing a list of files, datasets or containers | `--user <kerberos_username>`: Kerberos user for Spark jobs<br>`--reason <reason>`: comment for the invalidation<br>`--dry-run` (**optional**): simulate the execution without actually performing the file invalidation<br>`--erase-mode` (**optional**): erase empty DIDs | `./certs/usercert.pem`<br>`./certs/userkey.pem`<br>`./secrets/<user>.keytab`|
| Integrity Validation | Validate the integrity of files at the given RSE | `integrity-validation` | `<filename>.csv`: CSV file containing a list of files and RSEs [FILENAME,RSE_EXPRESSION] | `--dry-run` (**optional**): simulate the execution without invalidating the replicas found to be corrupted | `./certs/usercert.pem`<br>`./certs/userkey.pem`|
| Site Invalidation | Invalidate on Rucio all files from the received list at a specific site | `site-invalidation` | `<filename>.txt`: text file containing a list of files, datasets or containers | `--user <kerberos_username>`: Kerberos user for Spark jobs<br>`--rse <rse>`: RSE to invalidate at<br>`--reason <reason>`: comment for the invalidation<br>`--dry-run` (**optional**): simulate the execution without actually performing the file invalidation | `./certs/usercert.pem`<br>`./certs/userkey.pem`<br>`./secrets/<user>.keytab`|
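
For reference, a minimal input file for the `global`, `only-dbs`, and `only-rucio` modes holds one DID per line. The sketch below builds a one-line `dids.txt`, reusing the container DID shipped in `input_examples/dids.txt`:

```Bash
# Minimal sketch: one file, dataset, or container per line
cat > dids.txt <<'EOF'
/DMsimp_t-S3D_uR_JChiChi_Mphi-2000_Mchi-350_Lambda-1p0_TuneCUETP8M1_13TeV-madgraph_pythia8/RunIIFall17NanoAODv7-PU2017_12Apr2018_Nano02Apr2020_102X_mc2017_realistic_v8-v1/NANOAODSIM#1ecf1b2a-df7e-4c8d-82e1-2bc0b49d5271
EOF
```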

> **Note:** The userkey.pem should be decrypted.

??? Example
    **USERKEY decryption**

    `openssl rsa -in <encrypted_userkey> -out userkey.pem`

    You will be asked to enter the key's passphrase.
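
If you only have a PKCS#12 grid-certificate bundle, a common way to extract the PEM pair is sketched below; the filename `myCertificate.p12` is a placeholder, and the extracted key still needs the decryption step shown above:

```Bash
# Hedged sketch, assuming a PKCS#12 bundle named myCertificate.p12 (placeholder)
openssl pkcs12 -in myCertificate.p12 -clcerts -nokeys -out certs/usercert.pem
openssl pkcs12 -in myCertificate.p12 -nocerts -out certs/userkey.enc.pem
openssl rsa -in certs/userkey.enc.pem -out certs/userkey.pem   # decrypt, as above
```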

??? Info
    **Checksum Validation Mode**

    Some files can be large and may exceed your lxplus quota. If you see the error below, move your working directory to `/eos/user/<first_username_letter>/<username>/`:
    ```Bash
    gfal-copy error: 122 (Disk quota exceeded) - errno reported by local system call Disk quota exceeded
    ```
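
A minimal relocation sketch, assuming a hypothetical user `jdoe`:

```Bash
# Hedged sketch: work from EOS instead of the default home directory
# (the username jdoe and the target path are placeholders)
mkdir -p /eos/user/j/jdoe/file_invalidation
cd /eos/user/j/jdoe/file_invalidation
```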

### Environment

This tool is intended to be run on **lxplus** or any CERN server with access to `registry.cern.ch` and the `/cvmfs/` directory.

Putting it all together, the working directory structure may vary slightly, but it should look like this:

    working_directory/
    ├── dids.txt / replicas_validation.csv
    ├── certs/
    │   ├── usercert.pem
    │   └── userkey.pem
    └── secrets/
        └── <user>.keytab
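
As a sketch, the layout can be assembled like this (source paths are placeholders; the keytab is only needed for the modes that run Spark jobs):

```Bash
# Hedged sketch: prepare the working directory expected by the tool
mkdir -p working_directory/certs working_directory/secrets
cp /path/to/usercert.pem /path/to/userkey.pem working_directory/certs/   # decrypted key
cp /path/to/<user>.keytab working_directory/secrets/
cp /path/to/dids.txt working_directory/                                  # or the CSV input
cd working_directory
```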

## Run the File Invalidation Tool

### 1. CERN Registry Authentication

1. Visit the [CERN registry](https://registry.cern.ch/).
2. Log in via the OIDC provider.
3. Click on your username in the top-right corner.
4. Click on **User Profile**.
5. Copy the **CLI Secret**; it will be used in the next step.

### 2. Login into CERN Registry
```Bash
docker login registry.cern.ch -u <username>
```
- `docker login`: Logs in to the Docker registry.
- `registry.cern.ch`: CERN registry URL.
- `-u <username>`: CERN registry username.

When prompted for a password, **enter your CLI Secret**.

### 3. Run the container

```Bash
docker run -P \
-v "$(pwd)/<input_file>:/input/<input_file>" \
-v "$(pwd)/certs:/certs" \
[-v "$(pwd)/secrets:/secrets" \]
--mount type=bind,source=/cvmfs/,target=/cvmfs/,readonly \
--network host --rm registry.cern.ch/cmsrucio/file_invalidation_tool [Tool_Mode_Options]
```
- `docker run`: Executes a Docker container.
- `-P`: Publishes all exposed ports to the host interfaces.
- Volumes mounted:
  - `-v "$(pwd)/<input_file>:/input/<input_file>"`: Mounts the input file from the host into `/input/` within the container.
  - `-v "$(pwd)/certs:/certs"`: Mounts the certs directory from the host to `/certs` within the container. It must contain `usercert.pem` and `userkey.pem`.
  - `-v "$(pwd)/secrets:/secrets"`: Mounts the secrets directory from the host to `/secrets` within the container (only needed for the modes that run Spark jobs). It must contain the keytab file.
  - `--mount type=bind,source=/cvmfs/,target=/cvmfs/,readonly`: Binds the `/cvmfs/` directory on the host as read-only within the container.
- `--network host`: Uses the host's network stack within the container.
- `--rm`: Automatically removes the container when it exits.
- `registry.cern.ch/cmsrucio/file_invalidation_tool`: Name of the Docker image to run.

??? Example

```Bash
docker run -P \
-v "$(pwd)/<input_file>:/input/<input_file>.txt" \
-v "$(pwd)/certs:/certs" \
-v "$(pwd)/secrets:/secrets" \
--mount type=bind,source=/cvmfs/,target=/cvmfs/,readonly \
--network host --rm registry.cern.ch/cmsrucio/file_invalidation_tool [global | only-dbs | only-rucio] --user <username> --reason <reason>
```

```Bash
docker run -P \
-v "$(pwd)/<input_file>:/input/<input_file>.csv" \
-v "$(pwd)/certs:/certs" \
--mount type=bind,source=/cvmfs/,target=/cvmfs/,readonly \
--network host --rm registry.cern.ch/cmsrucio/file_invalidation_tool integrity-validation
```

```Bash
docker run -P \
-v "$(pwd)/<input_file>:/input/<input_file>.txt" \
-v "$(pwd)/certs:/certs" \
-v "$(pwd)/secrets:/secrets" \
--mount type=bind,source=/cvmfs/,target=/cvmfs/,readonly \
--network host --rm registry.cern.ch/cmsrucio/file_invalidation_tool site-invalidation --rse <rse> --user <username> --reason <reason>
```
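
Before a real run, consider a dry run; per the options table above, `--dry-run` simulates the execution without actually performing the invalidation:

```Bash
docker run -P \
-v "$(pwd)/<input_file>:/input/<input_file>.txt" \
-v "$(pwd)/certs:/certs" \
-v "$(pwd)/secrets:/secrets" \
--mount type=bind,source=/cvmfs/,target=/cvmfs/,readonly \
--network host --rm registry.cern.ch/cmsrucio/file_invalidation_tool global --user <username> --reason <reason> --dry-run
```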
## Additional Notes

- The tool's output will provide details about the invalidation process.
- User authorization: ensure you have the necessary permissions to invalidate files on DBS.
- The provided certificates are used for the DBS invalidation; if authorization fails, the Rucio invalidation will not be executed.
- The Rucio invalidation is performed using the dmtops certificate and the transfer_ops account, since most users do not have permission to perform this operation.
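
As a hedged sketch, the intermediate files written by the Spark jobs (paths taken from the `to_csv()` calls in `container_invalidation_spark.py` and `dataset_invalidation_spark.py` below) can be inspected inside the container like this:

```Bash
cat /input/dbs_files_inv.txt        # files to invalidate on DBS
cat /input/rucio_replicas_inv.csv   # FILENAME,RSES with a semicolon-separated RSE list
cat /input/datasets_inv.txt         # datasets to erase from Rucio (container mode)
cat /input/rucio_rules_delete.csv   # RULE_ID,RSE for rules protecting the replicas
```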
50 changes: 50 additions & 0 deletions DMOps/file_invalidation_tool/src/container_invalidation_spark.py
@@ -0,0 +1,50 @@
from pyspark.sql.functions import col, collect_list, concat_ws
import click
from CMSSpark.spark_utils import get_spark_session
from hadoop_queries import get_df_rse_locks, get_df_rse_replicas, get_df_contents
from pyspark.sql.window import Window

@click.command()
@click.option('--filename', required=True, default=None, type=str,
              help='Name of the text file containing the container names')
@click.option('--rse', required=False, default=None, type=str,
              help='RSE to look at')
def invalidate_containers(filename, rse):
    spark = get_spark_session(app_name='global_containers_invalidation')

    # Read the containers to delete
    filename = f'/user/dmtops/{filename}'
    df_delete = spark.read.text(filename)
    df_delete = df_delete.withColumnRenamed('value', 'CONTAINER')

    # Get the base dataframes
    df_locks = get_df_rse_locks(spark)
    df_replicas = get_df_rse_replicas(spark, rse)
    df_contents = get_df_contents(spark).alias('co')

    # Get the content of the containers to delete (content includes filename, dataset and container)
    df_delete = df_delete.join(df_contents, df_delete.CONTAINER == df_contents.CONTAINER, how='inner').select(['co.*']).alias('de')

    # Replicas to declare as bad
    df_delete = df_delete.join(df_replicas, df_delete.FILENAME == df_replicas.NAME, how='inner').select(['de.*', 'RSE', 'REPLICA_STATE']).alias('de')

    # Rules protecting the replicas
    df_delete = df_delete.join(df_locks, (df_delete.FILENAME == df_locks.NAME) & (df_delete.RSE == df_locks.RSE), how='left').select(['de.*', 'RULE_ID']).alias('de')
    df_delete.cache()

    # Files to invalidate on DBS
    df_delete.select('FILENAME').distinct().toPandas().to_csv('/input/dbs_files_inv.txt', index=False, header=False)

    # Replicas to invalidate on Rucio, grouped per file as a semicolon-separated RSE list
    windowSpec = Window.partitionBy('FILENAME')
    df_delete.withColumn("RSES", collect_list(col("RSE")).over(windowSpec)) \
        .select(['FILENAME', 'RSES']).withColumn("RSES", concat_ws(";", "RSES")).distinct().toPandas().to_csv('/input/rucio_replicas_inv.csv', index=False)

    # Datasets to erase from Rucio
    df_delete.select('DATASET').distinct().toPandas().to_csv('/input/datasets_inv.txt', index=False, header=False)

    # RSE is exported in case it is tape and requires purge_replicas
    df_delete.filter(col('RULE_ID').isNotNull()).select(['RULE_ID', 'RSE']).distinct() \
        .toPandas().to_csv('/input/rucio_rules_delete.csv', index=False)

if __name__ == "__main__":
    invalidate_containers()
54 changes: 54 additions & 0 deletions DMOps/file_invalidation_tool/src/dataset_invalidation_spark.py
@@ -0,0 +1,54 @@
import click
from CMSSpark.spark_utils import get_spark_session
from pyspark.sql.functions import col, collect_list, concat_ws
from hadoop_queries import get_df_rse_locks, get_df_rse_replicas, get_df_contents, get_df_dataset_level_rules
from pyspark.sql.window import Window

@click.command()
@click.option('--filename', required=True, default=None, type=str,
              help='Name of the text file containing the dataset names')
@click.option('--rse', required=False, default=None, type=str,
              help='RSE to look at')
def invalidate_datasets(filename, rse):
    spark = get_spark_session(app_name='global_dataset_invalidation')

    # Read the datasets to delete
    filename = f'/user/dmtops/{filename}'
    df_delete = spark.read.text(filename)
    df_delete = df_delete.withColumnRenamed('value', 'DATASET')

    # Get the base dataframes
    df_locks = get_df_rse_locks(spark)
    df_replicas = get_df_rse_replicas(spark, rse)
    df_contents = get_df_contents(spark).alias('co')
    df_rules = get_df_dataset_level_rules(spark).alias('ru')

    # Get the content of the datasets to delete (content includes filename, dataset and container)
    df_delete = df_delete.join(df_contents, df_delete.DATASET == df_contents.DATASET, how='inner').select(['co.*']).alias('de')

    # Replicas to declare as bad
    df_delete = df_delete.join(df_replicas, df_delete.FILENAME == df_replicas.NAME, how='inner').select(['de.*', 'RSE', 'REPLICA_STATE']).alias('de')

    # Rules protecting the replicas
    df_delete = df_delete.join(df_locks, (df_delete.FILENAME == df_locks.NAME) & (df_delete.RSE == df_locks.RSE), how='left') \
        .withColumnRenamed('RULE_ID', 'LOCK_RULE_ID').select(['de.*', 'LOCK_RULE_ID'])

    # Rules protecting the datasets or children files
    df_delete = df_delete.join(df_rules, df_delete.LOCK_RULE_ID == df_rules.ID, how='left') \
        .withColumnRenamed('ID', 'RULE_ID').select(['de.*', 'RULE_ID']).alias('de')
    df_delete.cache()

    # Files to invalidate on DBS
    df_delete.select('FILENAME').distinct().toPandas().to_csv('/input/dbs_files_inv.txt', index=False, header=False)

    # Replicas to invalidate on Rucio, grouped per file as a semicolon-separated RSE list
    windowSpec = Window.partitionBy('FILENAME')
    df_delete.withColumn("RSES", collect_list(col("RSE")).over(windowSpec)) \
        .select(['FILENAME', 'RSES']).withColumn("RSES", concat_ws(";", "RSES")).distinct().toPandas().to_csv('/input/rucio_replicas_inv.csv', index=False)

    # RSE is exported in case it is tape and requires purge_replicas
    df_delete.filter(col('RULE_ID').isNotNull()).select(['RULE_ID', 'RSE']).distinct() \
        .toPandas().to_csv('/input/rucio_rules_delete.csv', index=False)

if __name__ == "__main__":
    invalidate_datasets()