diff --git a/package.json b/package.json index be2b805b4..5ca8c08ea 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "lint:fix": "eslint --fix src/**/*.{js,json} tests/**/*.js scripts/**/*.js && jsonlint -isV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -is --trim-trailing-commas --enforce-double-quotes ./src/categories.json", "validate": "yarn run lint && node ./scripts/validate.js", "test": "jest", - "upload": "node ./scripts/upload_technology.js", + "upload": "node ./scripts/bigquery_upload.js", "convert": "node --no-warnings ./scripts/convert.js", "build": "yarn run validate && yarn run convert && node ./scripts/build.js" }, diff --git a/scripts/upload_technology.js b/scripts/bigquery_upload.js similarity index 80% rename from scripts/upload_technology.js rename to scripts/bigquery_upload.js index b7057aea6..a8f864bbb 100644 --- a/scripts/upload_technology.js +++ b/scripts/bigquery_upload.js @@ -5,52 +5,12 @@ const fs = require('fs') const path = require('path') const { BigQuery } = require('@google-cloud/bigquery') -const readJsonFiles = (directory) => { - const files = fs.readdirSync(directory) - return files.reduce((mergedData, file) => { - const filePath = path.join(directory, file) - const data = fs.readFileSync(filePath, 'utf8') - return { ...mergedData, ...JSON.parse(data) } - }, {}) -} - -const getArray = (value) => - typeof value === 'string' ? [value] : Array.isArray(value) ? value : [] - -const getRuleObject = (value) => { - if (typeof value === 'string') { - return [{ name: value, value: null }] - } - if (Array.isArray(value)) { - return value.map((key) => ({ name: key, value: null })) - } - if (typeof value === 'object') { - return Object.keys(value).map((key) => ({ - name: key, - value: - typeof value[key] === 'object' - ? 
JSON.stringify(value[key]) - : value[key].toString(), - })) - } - return [] -} - -const loadToBigQuery = async ( - data, - tableName = 'apps', - datasetName = 'wappalyzer', - writeDisposition = 'WRITE_TRUNCATE', - sourceFormat = 'NEWLINE_DELIMITED_JSON' -) => { - if (!data) { - throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`) - } +const bigquery = new BigQuery({ + keyFilename: '/tmp/gcp_key.json', +}) - const bigquery = new BigQuery({ - keyFilename: '/tmp/gcp_key.json', - }) - const schema = { +const schemas = { + technologies: { fields: [ { name: 'name', type: 'STRING' }, { name: 'categories', type: 'STRING', mode: 'REPEATED' }, @@ -137,8 +97,58 @@ const loadToBigQuery = async ( { name: 'script', type: 'STRING', mode: 'REPEATED' }, { name: 'html', type: 'STRING', mode: 'REPEATED' }, ], + }, + categories: { + fields: [ + { name: 'name', type: 'STRING' }, + { name: 'description', type: 'STRING' }, + ], + }, +} + +const readJsonFiles = (directory) => { + const files = fs.readdirSync(directory) + return files.reduce((mergedData, file) => { + const filePath = path.join(directory, file) + const data = fs.readFileSync(filePath, 'utf8') + return { ...mergedData, ...JSON.parse(data) } + }, {}) +} + +const getArray = (value) => + typeof value === 'string' ? [value] : Array.isArray(value) ? value.sort() : [] + +const getRuleObject = (value) => { + if (typeof value === 'string') { + return [{ name: value, value: null }] + } + if (Array.isArray(value)) { + return value.map((key) => ({ name: key, value: null })) + } + if (typeof value === 'object') { + return Object.keys(value).map((key) => ({ + name: key, + value: + typeof value[key] === 'object' + ? 
JSON.stringify(value[key]) + : value[key].toString(), + })) + } + return [] +} + +const loadToBigQuery = async ( + data, + tableName = 'technologies', + datasetName = 'wappalyzer', + writeDisposition = 'WRITE_TRUNCATE', + sourceFormat = 'NEWLINE_DELIMITED_JSON' +) => { + if (!data) { + throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`) } + const schema = schemas[tableName] const options = { schema, sourceFormat, writeDisposition } const [job] = await bigquery .dataset(datasetName) @@ -147,11 +157,11 @@ const loadToBigQuery = async ( if (job.status.errors && job.status.errors.length > 0) { console.error('Errors encountered:', job.status.errors) - throw new Error('Error loading data into BigQuery') + throw new Error(`Error loading data into ${datasetName}.${tableName}`) } console.log( - `Loaded ${job.numRowsLoaded} rows into ${datasetName}.${tableName}...` + `Loaded ${job.statistics.load.outputRows} rows into ${datasetName}.${tableName}` ) } @@ -164,9 +174,9 @@ const main = async () => { const transformedTechnologies = Object.keys(technologies).map((key) => { const app = { name: key, - categories: technologies[key].cats.map( - (category) => categories[category].name - ), + categories: technologies[key].cats + .map((category) => categories[category].name) + .sort(), } ;[ @@ -208,13 +218,23 @@ const main = async () => { const transformedTechnologiesJsonL = transformedTechnologies .map((line) => JSON.stringify(line)) .join('\n') - const filePath = './transformedTechnologies.jsonl' - fs.writeFileSync(filePath, transformedTechnologiesJsonL) - - await loadToBigQuery(filePath, 'apps') - - // cleanup file - fs.unlinkSync(filePath) + const technologiesFilePath = './transformedTechnologies.jsonl' + fs.writeFileSync(technologiesFilePath, transformedTechnologiesJsonL) + await loadToBigQuery(technologiesFilePath, 'technologies') + fs.unlinkSync(technologiesFilePath) + + const transformedCategoriesJsonL = Object.values(categories) + .map((value) => + 
JSON.stringify({ + name: value.name, + description: value.description, + }) + ) + .join('\n') + const categoriesFilePath = './transformedCategories.jsonl' + fs.writeFileSync(categoriesFilePath, transformedCategoriesJsonL) + await loadToBigQuery(categoriesFilePath, 'categories') + fs.unlinkSync(categoriesFilePath) } main().catch(console.error) diff --git a/src/categories.json b/src/categories.json index cfe41a635..36a1c9205 100644 --- a/src/categories.json +++ b/src/categories.json @@ -1,5 +1,6 @@ { "1": { + "description": "Platforms used to create, manage, and modify content on a website without needing specialized technical knowledge", "groups": [ 3 ], @@ -7,6 +8,7 @@ "priority": 1 }, "2": { + "description": "Online platforms where users can post and discuss topics, fostering online communities", "groups": [ 3, 4, @@ -16,6 +18,7 @@ "priority": 1 }, "3": { + "description": "Software applications used to store, retrieve, and manage data", "groups": [ 5 ], @@ -23,6 +26,7 @@ "priority": 2 }, "4": { + "description": "Tools and platforms to host information about products, services, or software", "groups": [ 3 ], @@ -30,6 +34,7 @@ "priority": 2 }, "5": { + "description": "Small, self-contained applications embedded into a webpage", "groups": [ 6 ], @@ -37,6 +42,7 @@ "priority": 9 }, "6": { + "description": "Platforms providing the infrastructure to sell products and services online, managing everything from product catalogs to payments", "groups": [ 1 ], @@ -44,6 +50,7 @@ "priority": 1 }, "7": { + "description": "Platforms used to display images on a website", "groups": [ 3, 10 @@ -52,6 +59,7 @@ "priority": 1 }, "8": { + "description": "Collaborative websites that allow users to create and edit pages", "groups": [ 3 ], @@ -59,6 +67,7 @@ "priority": 1 }, "9": { + "description": "Interfaces for managing web servers and hosting environments", "groups": [ 5, 7 @@ -67,6 +76,7 @@ "priority": 2 }, "10": { + "description": "Tools that track user behavior and provide insights 
into website performance", "groups": [ 8 ], @@ -74,6 +84,7 @@ "priority": 9 }, "11": { + "description": "Platforms that allow users to publish and manage articles and posts", "groups": [ 3 ], @@ -81,6 +92,7 @@ "priority": 1 }, "12": { + "description": "Used for building dynamic web applications and user interfaces", "groups": [ 9 ], @@ -88,6 +100,7 @@ "priority": 8 }, "13": { + "description": "Used to manage and track bugs, tasks, and features in software development", "groups": [ 3, 18 @@ -96,6 +109,7 @@ "priority": 2 }, "14": { + "description": "Embed videos on websites", "groups": [ 10 ], @@ -103,6 +117,7 @@ "priority": 7 }, "15": { + "description": "Allow users to post and interact with comments on a website", "groups": [ 3, 18 @@ -111,6 +126,7 @@ "priority": 9 }, "16": { + "description": "Technologies that protect websites from vulnerabilities and attacks", "groups": [ 11 ], @@ -118,6 +134,7 @@ "priority": 9 }, "17": { + "description": "Control how fonts are displayed on a webpage", "groups": [ 9 ], @@ -125,6 +142,7 @@ "priority": 9 }, "18": { + "description": "Provide a structure for building web applications", "groups": [ 9 ], @@ -132,6 +150,7 @@ "priority": 7 }, "19": { + "description": "Tools and technologies that don't fit into other categories", "groups": [ 6 ], @@ -139,6 +158,7 @@ "priority": 10 }, "20": { + "description": "Used to create and manage website content, code, and other files", "groups": [ 9 ], @@ -146,6 +166,7 @@ "priority": 4 }, "21": { + "description": "Used to create, manage, and deliver online courses", "groups": [ 3 ], @@ -153,6 +174,7 @@ "priority": 1 }, "22": { + "description": "Software applications that deliver web pages to users' browsers", "groups": [ 7 ], @@ -160,6 +182,7 @@ "priority": 8 }, "23": { + "description": "Technologies that store frequently accessed data closer to the user to improve page load times", "groups": [ 7 ], @@ -167,6 +190,7 @@ "priority": 7 }, "24": { + "description": "Allow users to create and format text 
within a web application", "groups": [ 3 ], @@ -174,6 +198,7 @@ "priority": 5 }, "25": { + "description": "Libraries used to create visual elements on websites", "groups": [ 9 ], @@ -181,6 +206,7 @@ "priority": 6 }, "26": { + "description": "Used to create mobile applications and mobile-first websites", "groups": [ 9 ], @@ -188,6 +214,7 @@ "priority": 8 }, "27": { + "description": "Build website functionalities and backend services", "groups": [ 9 ], @@ -195,6 +222,7 @@ "priority": 5 }, "28": { + "description": "Run web servers and infrastructure", "groups": [ 7 ], @@ -202,6 +230,7 @@ "priority": 6 }, "29": { + "description": "Embedded on websites to allow users to find specific content", "groups": [ 3 ], @@ -209,6 +238,7 @@ "priority": 4 }, "30": { + "description": "Systems that allow users to send and receive emails through a browser", "groups": [ 4 ], @@ -216,6 +246,7 @@ "priority": 2 }, "31": { + "description": "Distribute website content globally to improve load times for users", "groups": [ 7 ], @@ -223,6 +254,7 @@ "priority": 9 }, "32": { + "description": "Tools that streamline marketing activities", "groups": [ 2 ], @@ -230,6 +262,7 @@ "priority": 9 }, "33": { + "description": "Enhance the capabilities of web servers", "groups": [ 7 ], @@ -237,6 +270,7 @@ "priority": 7 }, "34": { + "description": "Store website information", "groups": [ 7 ], @@ -244,6 +278,7 @@ "priority": 5 }, "35": { + "description": "Technologies that embed maps into websites", "groups": [ 17 ], @@ -251,6 +286,7 @@ "priority": 6 }, "36": { + "description": "Platforms that display ads on websites", "groups": [ 2 ], @@ -258,6 +294,7 @@ "priority": 9 }, "37": { + "description": "Devices, like routers and switches, that are fundamental infrastructure for the network", "groups": [ 7 ], @@ -265,6 +302,7 @@ "priority": 2 }, "38": { + "description": "Store and stream media content", "groups": [ 10, 7 @@ -273,6 +311,7 @@ "priority": 1 }, "39": { + "description": "Integrations used for video calls and similar 
functions", "groups": [ 4 ], @@ -280,6 +319,7 @@ "priority": 9 }, "41": { + "description": "Handle online transactions", "groups": [ 1 ], @@ -287,6 +327,7 @@ "priority": 8 }, "42": { + "description": "Allow the addition of scripts and tracking tools to websites", "groups": [ 8 ], @@ -294,6 +335,7 @@ "priority": 9 }, "44": { + "description": "Systems that automate building, testing, and deploying code", "groups": [ 9 ], @@ -301,6 +343,7 @@ "priority": 3 }, "45": { + "description": "Manage website infrastructure", "groups": [ 7 ], @@ -308,6 +351,7 @@ "priority": 2 }, "46": { + "description": "Technologies that allow for remote administration of systems", "groups": [ 4 ], @@ -315,6 +359,7 @@ "priority": 1 }, "47": { + "description": "Tools that assist in building and debugging websites and applications", "groups": [ 9 ], @@ -322,6 +367,7 @@ "priority": 2 }, "48": { + "description": "Solutions that store web data and assets", "groups": [ 10 ], @@ -329,6 +375,7 @@ "priority": 2 }, "49": { + "description": "Technologies that allow users to subscribe to content updates", "groups": [ 3 ], @@ -336,6 +383,7 @@ "priority": 1 }, "50": { + "description": "Help to manage and organize documents", "groups": [ 3 ], @@ -343,6 +391,7 @@ "priority": 1 }, "51": { + "description": "Provide a visual way to design webpages", "groups": [ 9 ], @@ -350,6 +399,7 @@ "priority": 1 }, "52": { + "description": "Tools that allow for real-time customer support", "groups": [ 4, 16 @@ -358,6 +408,7 @@ "priority": 9 }, "53": { + "description": "Systems that help manage customer interactions", "groups": [ 2, 16 @@ -366,6 +417,7 @@ "priority": 5 }, "54": { + "description": "Tools that help improve website ranking", "groups": [ 2 ], @@ -373,6 +425,7 @@ "priority": 8 }, "55": { + "description": "Tools that manage financial data", "groups": [ 16 ], @@ -380,6 +433,7 @@ "priority": 1 }, "56": { + "description": "Use website visitors' resources without permission", "groups": [ 5 ], @@ -387,6 +441,7 @@ 
"priority": 5 }, "57": { + "description": "Create static HTML files", "groups": [ 9 ], @@ -394,6 +449,7 @@ "priority": 1 }, "58": { + "description": "Technologies that guide new users", "groups": [ 6 ], @@ -401,6 +457,7 @@ "priority": 8 }, "59": { + "description": "Provide pre-written code", "groups": [ 9 ], @@ -408,6 +465,7 @@ "priority": 9 }, "60": { + "description": "Technologies that package applications with their dependencies", "groups": [ 7 ], @@ -415,6 +473,7 @@ "priority": 8 }, "62": { + "description": "Provides a platform for software development", "groups": [ 7 ], @@ -422,6 +481,7 @@ "priority": 8 }, "63": { + "description": "Provides computing resources", "groups": [ 7 ], @@ -429,6 +489,7 @@ "priority": 8 }, "64": { + "description": "Intercept client requests and forward them to the appropriate server", "groups": [ 7 ], @@ -436,6 +497,7 @@ "priority": 7 }, "65": { + "description": "Distribute incoming traffic to multiple servers", "groups": [ 7 ], @@ -443,6 +505,7 @@ "priority": 7 }, "66": { + "description": "Provide pre-built components to build user interfaces", "groups": [ 9 ], @@ -450,6 +513,7 @@ "priority": 7 }, "67": { + "description": "Tools that manage user consent for cookies", "groups": [ 13 ], @@ -457,6 +521,7 @@ "priority": 9 }, "68": { + "description": "Technologies that help users with disabilities navigate websites", "groups": [ 9 ], @@ -464,6 +529,7 @@ "priority": 9 }, "69": { + "description": "Systems that manage user logins and security", "groups": [ 11 ], @@ -471,6 +537,7 @@ "priority": 6 }, "70": { + "description": "Provide certificates used to encrypt web traffic", "groups": [ 11 ], @@ -478,6 +545,7 @@ "priority": 9 }, "71": { + "description": "Track marketing campaigns on websites", "groups": [ 2 ], @@ -485,6 +553,7 @@ "priority": 9 }, "72": { + "description": "Tools that allow users to book appointments online", "groups": [ 14 ], @@ -492,6 +561,7 @@ "priority": 9 }, "73": { + "description": "Tools that collect user feedback", 
"groups": [ 8 ], @@ -499,6 +569,7 @@ "priority": 9 }, "74": { + "description": "Tools that help optimize website performance and user experience", "groups": [ 8 ], @@ -506,6 +577,7 @@ "priority": 9 }, "75": { + "description": "Integration technologies that affect user communication", "groups": [ 4, 2 @@ -514,6 +586,7 @@ "priority": 9 }, "76": { + "description": "Tools that tailor website content based on user data", "groups": [ 2 ], @@ -521,6 +594,7 @@ "priority": 9 }, "77": { + "description": "Technologies that display ads to users who have previously interacted with the site", "groups": [ 2 ], @@ -528,6 +602,7 @@ "priority": 9 }, "78": { + "description": "Tools that track performance as experienced by users", "groups": [ 2 ], @@ -535,6 +610,7 @@ "priority": 9 }, "79": { + "description": "Technologies that use user location data for site features", "groups": [ 17 ], @@ -542,6 +618,7 @@ "priority": 9 }, "80": { + "description": "Define the look and feel of WordPress websites", "groups": [ 15 ], @@ -549,6 +626,7 @@ "priority": 7 }, "81": { + "description": "Define the design of Shopify stores", "groups": [ 15 ], @@ -556,6 +634,7 @@ "priority": 7 }, "82": { + "description": "Dictate the appearance of Drupal websites", "groups": [ 15 ], @@ -563,6 +642,7 @@ "priority": 7 }, "83": { + "description": "Technologies that collect information about users", "groups": [ 8 ], @@ -570,6 +650,7 @@ "priority": 9 }, "84": { + "description": "Programs that track user engagement", "groups": [ 1 ], @@ -577,6 +658,7 @@ "priority": 9 }, "85": { + "description": "Technologies that control when features are shown", "groups": [ 9 ], @@ -584,6 +666,7 @@ "priority": 9 }, "86": { + "description": "Tools that group users based on behavior", "groups": [ 2 ], @@ -591,6 +674,7 @@ "priority": 9 }, "87": { + "description": "Extend functionality of WordPress websites", "groups": [ 15 ], @@ -598,6 +682,7 @@ "priority": 8 }, "88": { + "description": "Providers that offer server space for websites", 
"groups": [ 7 ], @@ -605,6 +690,7 @@ "priority": 9 }, "89": { + "description": "Technologies that make content accessible across languages", "groups": [ 3 ], @@ -612,6 +698,7 @@ "priority": 9 }, "90": { + "description": "Systems that display user feedback", "groups": [ 2, 18 @@ -620,6 +707,7 @@ "priority": 9 }, "91": { + "description": "Systems that allow payment flexibility for online purchases", "groups": [ 1 ], @@ -627,6 +715,7 @@ "priority": 9 }, "92": { + "description": "Tools that measure and optimize site speed", "groups": [ 7 ], @@ -634,6 +723,7 @@ "priority": 9 }, "93": { + "description": "Tools that enable online booking and ordering", "groups": [ 14 ], @@ -641,6 +731,7 @@ "priority": 9 }, "94": { + "description": "Programs that incentivize users to promote a website", "groups": [ 2, 1 @@ -649,6 +740,7 @@ "priority": 9 }, "95": { + "description": "Systems that store and manage media files", "groups": [ 10 ], @@ -656,6 +748,7 @@ "priority": 9 }, "96": { + "description": "Systems that organize content from multiple sources", "groups": [ 2, 18 @@ -664,6 +757,7 @@ "priority": 9 }, "97": { + "description": "Gather customer data from various sources", "groups": [ 2, 8 @@ -672,6 +766,7 @@ "priority": 9 }, "98": { + "description": "Technologies that track and recover abandoned shopping carts", "groups": [ 1 ], @@ -679,6 +774,7 @@ "priority": 9 }, "99": { + "description": "Integration that provides shipping options during checkout", "groups": [ 1 ], @@ -686,6 +782,7 @@ "priority": 9 }, "100": { + "description": "Extend the functionality of Shopify stores", "groups": [ 15 ], @@ -693,6 +790,7 @@ "priority": 8 }, "101": { + "description": "Solutions used for hiring processes", "groups": [ 6, 16 @@ -701,6 +799,7 @@ "priority": 9 }, "102": { + "description": "Technologies that process product returns", "groups": [ 1 ], @@ -708,6 +807,7 @@ "priority": 9 }, "103": { + "description": "Technologies used to broadcast real-time video content", "groups": [ 1, 10 @@ -716,6 
+816,7 @@ "priority": 9 }, "104": { + "description": "Systems that allow users to book tickets online", "groups": [ 14 ], @@ -723,6 +824,7 @@ "priority": 9 }, "105": { + "description": "Technologies that provide interactive experiences", "groups": [ 10 ], @@ -730,6 +832,7 @@ "priority": 9 }, "106": { + "description": "Solutions that handle the complexities of selling products internationally", "groups": [ 1 ], @@ -737,6 +840,7 @@ "priority": 6 }, "107": { + "description": "Systems that manage order processing and delivery", "groups": [ 1 ], @@ -744,6 +848,7 @@ "priority": 6 }, "108": { + "description": "The user interface for online stores", "groups": [ 1 ], @@ -751,6 +856,7 @@ "priority": 6 }, "109": { + "description": "Solutions that redirect domains to a different location or page", "groups": [ 6 ], @@ -758,6 +864,7 @@ "priority": 9 }, "110": { + "description": "Technologies that enable users to build forms for data collection", "groups": [ 8 ], @@ -765,10 +872,11 @@ "priority": 8 }, "111": { + "description": "Tools that facilitate online charitable contributions", "groups": [ 6 ], "name": "Fundraising & donations", "priority": 9 } -} \ No newline at end of file +}