Add scripts that fetch external content

WebOfTrust · Jul 12, 2024 · fa74d47 · fa74d47
1 parent 6fbfa62
commit fa74d47
Show file tree

Hide file tree

Showing 24 changed files with 1,455 additions and 0 deletions.
diff --git a/fetchExternalContent/fetchAnnotatedCopies/README.md b/fetchExternalContent/fetchAnnotatedCopies/README.md
@@ -0,0 +1,5 @@
+# Order in which to run the scripts:
+
+1. fetchExternalContentMetaData.js
+2. fetchExternalContent.js
+3. addHTMLstructureToExternalContent.js
diff --git a/fetchExternalContent/fetchAnnotatedCopies/addHTMLstructureToExternalContent.js b/fetchExternalContent/fetchAnnotatedCopies/addHTMLstructureToExternalContent.js
@@ -0,0 +1,132 @@
+#!/usr/bin/env node
+
+/*
+  Author: Kor Dwarshuis
+  Created: 2023
+  Updated: -
+  Description:
+
+    Markdown to Bootstrap Accordion Converter
+    
+    This script automates the conversion of Markdown files in the directoryPath directory into Bootstrap accordion format.
+    It imports a JSON file named 'externalContentMetaData.json' to create a mapping of anchor tags to 'Level' attributes, which 
+    are then used as data attributes in the generated Bootstrap accordions.
+    
+    Features:
+    1. Reads all Markdown (.md) files in the specified directory.
+    2. Imports 'Level' attributes from an external JSON file.
+    3. Converts all headings in the Markdown files to H2.
+    4. Wraps sections under H2 headings in Bootstrap accordion divs, utilizing the imported 'Level' as a data attribute.
+    5. Writes the updated content back into each Markdown file.
+    
+    Dependencies: 
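+    - Node.js built-in modules: 'fs' for file system operations, 'path' for path manipulations.
+    - 'dotenv' for loading ANNOTATED_COPIES_INPUT_DIR and ANNOTATED_COPIES_OUTPUT_DIR from a .env file.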
+    
+    Logging:
+    Outputs a log message for each successfully updated file.
+ */
+
+
+
+
+const fs = require('fs');
+const path = require('path');
+require('dotenv').config();
+
+// Directory path
+const directoryPath = process.env.ANNOTATED_COPIES_OUTPUT_DIR;
+
+// Import external JSON object TODO: fix the way the path is constructed
+const externalContentMetaData = require(path.join(__dirname, '../.' + process.env.ANNOTATED_COPIES_INPUT_DIR));
+
+
+// Create mapping from the imported JSON
+let dataAttributeMap = {};
+externalContentMetaData.values.slice(1).forEach(row => {
+    let anchor = row[5];
+    if (anchor) {
+        // Remove everything before the last "#"
+        anchor = anchor.split("#").pop().toLowerCase().replace(/\s/g, '-');
+        dataAttributeMap[anchor] = row[11]; // using 'Level' as the data attribute
+    }
+});
+
+
+fs.readdir(directoryPath, (err, files) => {
+    if (err) {
+        return console.log('Unable to scan directory: ' + err);
+    }
+
+    // Process all .md files
+    files.filter(file => path.extname(file) === '.md').forEach(file => {
+        const markdownFilePath = path.join(directoryPath, file);
+
+        fs.readFile(markdownFilePath, 'utf8', (err, data) => {
+            if (err) {
+                console.error(`Failed to read file ${file}:`, err);
+                return;
+            }
+
+            // Replace all headings with H2
+            let updatedData = data.replace(/^(#{1,6}) (.*$)/gm, '## $2');
+
+            // Wrap H2 sections in divs with data-attributes
+            updatedData = updatedData.split(/\n(?=## )/g).map(section => {
+                let match = section.match(/## (.*)$/m);
+                let heading = match ? match[1] : null;
+                let anchor = heading ? heading.toLowerCase() : Math.floor(Math.random() * 10000000000000).toString();
+                anchor = anchor
+                    .replace(/\s/g, '-')
+                    .replace(/&/g, '-')
+                    .replace(/\//g, '-')
+                    .replace(/\\/g, '-')
+                    .replace(/</g, '-')
+                    .replace(/>/g, '-')
+                    .replace(/\(/g, '-')
+                    .replace(/\)/g, '-')
+                    .replace(/'/g, '-')
+                    .replace(/`/g, '-')
+                    .replace(/,/g, '-')
+                    .replace(/\./g, '-')
+                    .replace(/;/g, '-')
+                    .replace(/:/g, '-')
+                    .replace(/\?/g, '-')
+                    .replace(/\?/g, '-')
+                    .replace(/!/g, '-')
+                    .replace(/"/g, '-')
+                    ;
+                let dataAttribute = dataAttributeMap[anchor] || '1';
+
+                // Creating Bootstrap Accordion
+                // the “\n\n” must be added or the code will fail
+                return `
+                    \n\n<div className="accordion-item" data-level="${dataAttribute}">
+                        \n\n<h2 className="accordion-header" id="header${anchor}">
+                        \n\n<button className="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#accordeon-${anchor}" aria-expanded="false" aria-controls="accordeon-${anchor}">
+                            ${anchor}, level ${dataAttribute}
+                        \n\n</button>
+                        \n\n</h2>
+                        \n\n<div id="accordeon-${anchor}" className="accordion-collapse collapse">
+                        \n\n<div className="accordion-body">
+                            \n\n${section}
+                        \n\n</div>
+                        \n\n</div>
+                    \n\n</div>
+                `;
+            }).join('\n');
+
+            // Wrap all content in a div with the accordion className
+            updatedData = `<div className="accordion accordion-flush" id="annotated-copies">` + updatedData + `</div>`;
+
+            // Write to the file
+            fs.writeFile(markdownFilePath, updatedData, (err) => {
+                if (err) {
+                    console.error(`Failed to write to file ${file}:`, err);
+                    return;
+                }
+
+                console.log(`Successfully updated markdown file: ${file}`);
+            });
+        });
+    });
+});
diff --git a/fetchExternalContent/fetchAnnotatedCopies/fetchExternalContent.js b/fetchExternalContent/fetchAnnotatedCopies/fetchExternalContent.js
@@ -0,0 +1,144 @@
+#!/usr/bin/env node
+
+/*
+  Author: Kor Dwarshuis
+  Created: 2023
+  Updated: -
+  Description: 
+
+    This script consumes the data produced by the 'fetchExternalContentMetaData.js' script.
+    
+    This script performs the following tasks:
+    1. Reads the 'externalContentMetaData.json' file located in the './static/json/' directory to obtain a list of URLs.
+    2. Downloads Markdown files (.md) from the URLs and stores them in the outputFileLocation directory.
+    3. Cleans up the downloaded Markdown files by:
+       - Replacing Markdown links without URLs.
+       - Removing the first line if it is exactly "---".
+
+    Configuration:
+    - `inputFileLocation`: Path of the JSON file that contains the URLs (read from ANNOTATED_COPIES_INPUT_DIR).
+    - `outputFileLocation`: Directory where the downloaded files will be stored (read from ANNOTATED_COPIES_OUTPUT_DIR).
+
+    The code utilizes Node.js and its 'fs', 'path', and 'https' modules to read files, manage directories, and download content.
+    Promises are used for asynchronous operations.
+
+*/
+
+const fs = require('fs');
+const https = require('https');
+const path = require('path');
+require('dotenv').config();
+
+// Config
+const inputFileLocation = process.env.ANNOTATED_COPIES_INPUT_DIR;
+const outputFileLocation = process.env.ANNOTATED_COPIES_OUTPUT_DIR; // Where to copy the files to
+// End Config
+
+// Create the output directory if it doesn't exist
+if (!fs.existsSync(outputFileLocation)) {
+    fs.mkdirSync(outputFileLocation, { recursive: true });
+}
+
+function readFileAsync(filePath) {
+    return new Promise((resolve, reject) => {
+        fs.readFile(filePath, 'utf8', (err, data) => {
+            if (err) {
+                reject(err);
+                return;
+            }
+
+            try {
+                const inputData = JSON.parse(data);
+                resolve(inputData);
+            } catch (err) {
+                reject(err);
+            }
+        });
+    });
+}
+
+function processJSON(json) {
+    // Used for naming the downloaded file: Remove the protocol from the URL, this is done to ensure that the file name is valid (no colons, slashes, etc.)
+    function removeProtocol(inputString) {
+        if (inputString.startsWith("https://")) {
+            inputString = inputString.substring(8);
+        } else if (inputString.startsWith("http://")) {
+            inputString = inputString.substring(7);
+        }
+        let transformedString = inputString.replace('raw.githubusercontent.com/', '');
+        transformedString = transformedString.replace(/\//g, "-");
+        return transformedString;
+    }
+
+    json.values.forEach((item, index) => {
+        if (item[1] === 'Source') return;// First row is the header
+        if (item[1] === '') return;// Skip rows when there is no URL
+        if (item[1] === undefined) return; // Skip rows when there is no URL
+        const transformedUrl = removeProtocol(item[1]);
+
+        // only copy markdown files
+        if (!item[1].endsWith(".md")) { return; }
+        downloadFile(item[1], outputFileLocation + transformedUrl);
+    });
+}
+
+function downloadFile(url, destination) {
+    return new Promise((resolve, reject) => {
+        const file = fs.createWriteStream(destination);
+
+        https.get(url, response => {
+            response.pipe(file);
+
+            file.on('finish', () => {
+                file.close();
+                resolve();
+                cleanUpFile(destination);
+            });
+        }).on('error', error => {
+            fs.unlink(destination, () => {
+                reject(error);
+            });
+        });
+    });
+}
+
+function cleanUpFile(filePath) {
+    fs.readFile(filePath, 'utf8', (err, data) => {
+        if (err) {
+            console.error('Error reading file:', err);
+            return;
+        }
+
+        let updatedContent = data;
+
+        // Check and replace Markdown links without URLs
+        const regex = /\[([^\]]+)\]\(\)/g;
+        updatedContent = updatedContent.replace(regex, '$1');
+
+        // Check and remove first line if it's "---"
+        const lines = updatedContent.split('\n');
+        if (lines[0] === '---') {
+            lines.shift(); // Remove the first line
+            updatedContent = lines.join('\n');
+        }
+
+        if (data !== updatedContent) {
+            fs.writeFile(filePath, updatedContent, 'utf8', (err) => {
+                if (err) {
+                    console.error('Error saving file:', err);
+                } else {
+                    console.log('File updated successfully.');
+                }
+            });
+        } else {
+            console.log('No changes required. File remains unchanged.');
+        }
+    });
+
+}
+
+readFileAsync(inputFileLocation)
+    .then((input) => {
+        processJSON(input);
+    })
+    .catch((err) => {
+        console.error('Error reading file:', err);
+    });
+
diff --git a/fetchExternalContent/fetchAnnotatedCopies/fetchExternalContentMetaData.js b/fetchExternalContent/fetchAnnotatedCopies/fetchExternalContentMetaData.js
@@ -0,0 +1,82 @@
+#!/usr/bin/env node
+
+/*
+  Author: Kor Dwarshuis
+  Created: 2023
+  Updated: -
+  Description:
+  
+    This script creates the data for the fetchExternalContent.js script.
+
+    This Node.js script performs the following tasks:
+    1. Sends an HTTP GET request to a Google Sheets API endpoint (“WOT-terms” Google Sheet, tab “LabelContent”) to fetch JSON-formatted data (see https://sheets.googleapis.com/v4/spreadsheets/18IUa-1NSJ_8Tz_2D-VSuSQa_yf3ES1s_hovitm3Clvc/values/LabelContent?alt=json&key=AIzaSyCA4sOfLTriHKjaQftREYWMnQNokDHf_tM).
+       - The URL of the Google Sheet API endpoint is read from the ANNOTATED_COPIES_JSON_ENDPOINT environment variable (loaded via dotenv).
+    2. Receives and accumulates the JSON data in chunks as it is streamed from the Google Sheet API.
+    3. Once all data is received, it writes the JSON data to a file named 'externalContentMetaData.json' in the './static/json/' directory.
+    
+    Configuration:
+    - `outputDirJSON`: Directory where the output JSON file will be stored.
+    - `outputFileNameJSON`: Name of the output JSON file.
+    
+    Note: 
+    - The script should be run from the root of the project.
+    - For information on how to create a JSON endpoint from a Google Sheet, refer to https://stackoverflow.com/a/68854199
+    
+    The code uses the Node.js 'fs', 'path', and 'https' modules to manage directories, write files, and perform HTTPS GET requests.
+
+*/
+
+const fs = require('fs');
+const path = require('path');
+const https = require('https');
+require('dotenv').config();
+
+// Config
+const outputDirJSON = './static/json/'; //TODO: find a better place for this file
+const outputFileNameJSON = 'externalContentMetaData.json';
+// End Config
+
+
+// How to create JSON endpoint from Google Sheet: https://stackoverflow.com/a/68854199
+// const url =
+//   'https://sheets.googleapis.com/v4/spreadsheets/18IUa-1NSJ_8Tz_2D-VSuSQa_yf3ES1s_hovitm3Clvc/values/LabelContentTempCopy?alt=json&key=AIzaSyCA4sOfLTriHKjaQftREYWMnQNokDHf_tM';
+const url = process.env.ANNOTATED_COPIES_JSON_ENDPOINT;
+
+https
+  .get(url, (resp) => {
+    let data = '';
+
+    // A chunk of data has been received.
+    resp.on('data', (chunk) => {
+      data += chunk;
+    });
+
+    // The whole response has been received. Print out the result.
+    resp.on('end', () => {
+      writeJSONFile(data);
+    });
+  })
+  .on('error', (err) => {
+    console.log('Error: ' + err.message);
+  });
+
+function writeJSONFile(content) {
+  // Create the output directory if it doesn't exist
+  if (!fs.existsSync(outputDirJSON)) {
+    fs.mkdirSync(outputDirJSON, { recursive: true });
+  }
+
+  // Path to the output file
+  const filePath = path.join(outputDirJSON, outputFileNameJSON);
+
+  fs.writeFile(
+    filePath,
+    content,
+    function (err) {
+      if (err) {
+        return console.log(err);
+      }
+      console.log('JSON file has been written successfully.');
+    }
+  );
+} // End writeJSONFile
diff --git a/fetchExternalContent/fetchAnnotatedCopies/main.sh b/fetchExternalContent/fetchAnnotatedCopies/main.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Run the first script
+node fetchExternalContent/fetchAnnotatedCopies/fetchExternalContentMetaData.js
+
+# Run the second script
+node fetchExternalContent/fetchAnnotatedCopies/fetchExternalContent.js
+
+# Run the third script
+node fetchExternalContent/fetchAnnotatedCopies/addHTMLstructureToExternalContent.js