Skip to content

Commit

Permalink
Fixes Alternative Data Code Generator
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexCatarino committed Nov 30, 2024
1 parent f1f9685 commit fb27164
Showing 1 changed file with 106 additions and 103 deletions.
209 changes: 106 additions & 103 deletions code-generators/Alternative-Datasets-Code-Generator.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
import os
import pathlib
import shutil
from urllib.request import urlopen
from pathlib import Path
from itertools import groupby
from shutil import move, rmtree
from bs4 import BeautifulSoup

def metadata_content(vendor, dataset):
from _code_generation_helpers import get_json_content

DATASET = "03 Writing Algorithms/14 Datasets"

def _directory_content(path):
def append_path(v):
return [Path(f'{path}/{x}') for x in v]
key = lambda x: x[3:]
grouped = groupby(sorted(os.listdir(path), key=key), key)
return {k: append_path(v) for k,v in grouped}

def _write_metadata_file(folder, vendor, dataset):
vendor_ = vendor.lower().replace(" ", "-")
dataset_ = dataset.lower().replace(" ", "-")
return f'''{{
with open(folder / f'metadata.json', "w", encoding="utf-8") as json_file:
json_file.write(f'''{{
"type": "metadata",
"values": {{
"description": "{dataset} dataset from {vendor}.",
Expand All @@ -18,106 +29,98 @@ def metadata_content(vendor, dataset):
"og:site_name": "{dataset} - QuantConnect.com",
"og:image": "https://cdn.quantconnect.com/docs/i/writing-algorithms/datasets/{vendor_}/{dataset_}.png"
}}
}}'''

for clean_up in os.listdir('03 Writing Algorithms/14 Datasets'):
if not '01 Overview' in clean_up and not "readme" in clean_up:
destination = '03 Writing Algorithms/14 Datasets/' + clean_up
temp = 'tmp/03 Writing Algorithms/14 Datasets/' + clean_up
if os.path.isdir(destination):
shutil.copytree(destination, temp, dirs_exist_ok=True,
ignore=lambda dir, files: [f for f in files if os.path.isfile(os.path.join(dir, f)) and str(f) != "metadata.json" and str(f) != "00.json"])
shutil.rmtree(destination)
shutil.copytree(temp, destination, dirs_exist_ok=True)
shutil.rmtree(temp)

url = urlopen("https://s3.amazonaws.com/cdn.quantconnect.com/web/docs/alternative-data-dump-v2024-01-02.json")
response = url.read().decode("utf-8") \
.replace("true", "True") \
.replace("false", "False") \
.replace("null", "None")
doc = eval(response)

languages = {"language-python": "python", "language-cs": "csharp"}
vendor_count = 2
vendors = {}
product_count = {}
attr = False

priority = ["QuantConnect", "AlgoSeek", "Morningstar", "TickData", "CoinAPI", "OANDA"]
vendor_names = priority + sorted([x for x in [dataset["vendorName"].strip() for dataset in doc] if x not in priority])
for vendor in vendor_names:
if vendor not in vendors:
vendors[vendor] = vendor_count
product_count[vendor] = {dataset["name"].strip(): m+1 for m, dataset in enumerate(sorted([x for x in doc if x["vendorName"].strip() == vendor], key=lambda x: x['name']))}
vendor_count += 1

universe_html = """<p>The following alternative datasets support universe selection:</p>
}}''')

if __name__ == '__main__':

    # Fetch the alternative-dataset dump and sort it so groupby (which only
    # groups consecutive equal keys) sees each vendor's datasets together.
    url = "https://s3.amazonaws.com/cdn.quantconnect.com/web/docs/alternative-data-dump-v2024-01-02.json"
    docs = sorted(get_json_content(url), key=lambda x: x["vendorName"].strip())

    # vendor name -> datasets for that vendor, sorted by dataset name.
    docs_by_vendor = {k: sorted(v, key=lambda x: x['name'].strip())
                      for k, v in groupby(docs, lambda x: x["vendorName"].strip())}

    # Priority vendors keep their fixed leading positions; the rest follow
    # in the (already sorted) order they appear in docs_by_vendor.
    priority = ["QuantConnect", "AlgoSeek", "Morningstar", "TickData", "CoinAPI", "OANDA"]
    vendors = [x for x in priority if x in docs_by_vendor]
    vendors += [k for k in docs_by_vendor if k not in vendors]

    # Existing on-disk vendor folders keyed by prefix-stripped name; the
    # '01 Overview' folder is hand-written and must not be touched.
    current = _directory_content(DATASET)
    current.pop('Overview')

    languages = {"language-python": "python", "language-cs": "csharp"}

    universe_html = """<p>The following alternative datasets support universe selection:</p>
<ul>
"""

    for i, vendor in enumerate(vendors):
        # Vendor folders start at prefix 02 ('01 Overview' is reserved).
        vendor_folder = Path(f'{DATASET}/{i+2:02} {vendor}')
        vendor_folder.mkdir(parents=True, exist_ok=True)
        # Move any pre-existing folder for this vendor into its new slot.
        for path in current.pop(vendor, []):
            move(path, vendor_folder)
        datasets = _directory_content(vendor_folder)
        for f, dataset in enumerate(docs_by_vendor.pop(vendor)):
            name = dataset['name'].strip()
            folder = Path(f'{vendor_folder}/{f+1:02} {name}')
            folder.mkdir(parents=True, exist_ok=True)
            # Re-home any pre-existing dataset folder under the new numbering.
            for path in datasets.pop(name, []):
                move(path, folder)

            # Generated HTML is rebuilt from scratch on every run.
            [os.remove(x) for x in folder.iterdir() if x.suffix == '.html']

            _write_metadata_file(folder, vendor, name)

            # Merge the 'about' and 'documentation' sections; untitled items are dropped.
            all_sections = {**{item["title"]: item["content"] for item in dataset["about"] if item["title"]},
                            **{item["title"]: item["content"] for item in dataset["documentation"] if item["title"]}}

            for k, (title, content) in enumerate(all_sections.items()):
                # Normalize escaped slashes/arrows, make QC links site-relative,
                # and expand the bare embed frame into the styled python frame.
                content = content.replace("\/", "/") \
                    .replace("->", "-&gt;") \
                    .replace("=>", "=&gt;") \
                    .replace("=<", "=&lt;") \
                    .replace("https://www.quantconnect.com/docs/v2/", "/docs/v2/") \
                    .replace("https://www.quantconnect.com/datasets/", "/datasets/") \
                    .replace('<div class="qc-embed-frame"><div class="qc-embed-dummy"></div><div class="qc-embed-element"><iframe class="qc-embed-backtest"',
                             '<div class="qc-embed-frame python" style="display: inline-block; position: relative; width: 100%; min-height: 100px; min-width: 300px;"><div class="qc-embed-dummy" style="padding-top: 56.25%;"></div><div class="qc-embed-element" style="position: absolute; top: 0; bottom: 0; left: 0; right: 0;"><iframe class="qc-embed-backtest"')
                soup = BeautifulSoup(content, 'html.parser')
                # Convert <code class="language-*"> snippets inside example
                # containers into per-language <pre> blocks.
                for code_section in soup.find_all("div", class_="section-example-container"):
                    for pre_code_section in soup.find_all("pre"):
                        for old, new in languages.items():
                            for code_snippet in pre_code_section.find_all('code', {'class' : old}):
                                converted = f'{code_snippet}'.replace('code', 'pre').replace(old, new)
                                content = content.replace(f'{pre_code_section}', converted)

                if title.lower() == "example applications":
                    # Rebuild the backtest embeds with the styled iframe markup;
                    # the slice pulls the backtest id out of the embed's source.
                    start = content.find('<div class="dataset-embeds">')
                    if start > 0:
                        text = ''
                        for old, new in languages.items():
                            for code_section in soup.find_all("div", class_=f"qc-embed-frame {old}"):
                                text += f"\n<div class='{new}'><div class='qc-embed-frame' style='display: inline-block; position: relative; width: 100%; min-height: 100px; min-width: 300px;'><div class='qc-embed-dummy' style='padding-top: 56.25%;'></div><div class='qc-embed-element' style='position: absolute; top: 0; bottom: 0; left: 0; right: 0;'><iframe class='qc-embed-backtest' src='https://www.quantconnect.com/terminal/processCache?request=embedded_backtest{str(code_section)[-61:-28]}.html' style='max-width: calc(100vw - 30px); max-height: 100vw; overflow: hidden;' scrolling='no' width='100%' height='100%'></iframe></div></div></div>"
                        end = start + len(str(soup.find_all("div", class_="dataset-embeds")))
                        content = content.replace(content[start:end], text)

                    # Example applications always land in a fixed '99 ...' file.
                    with open(folder / f'99 {title.strip()}.html', "w", encoding="utf-8") as html_file:
                        html_file.write(content)
                else:
                    for old, new in languages.items():
                        content = content.replace(old, new)
                    if title.lower() == "introduction":
                        # f-string expressions can't contain backslashes pre-3.12,
                        # hence the named variable.
                        backslash = '\\'
                        content += f"""
<p>For more information about the {name} dataset, including CLI commands and pricing, see the <a href=\"{dataset['url'].lower().replace(backslash, '')}\">dataset listing</a>.<p>"""

                    # Priority vendors are excluded from the alternative-dataset
                    # universe list (they are market data, not alternative data).
                    if title.strip() == "Universe Selection" and vendor not in priority:
                        universe_html += f""" <li><a href="/docs/v2/writing-algorithms/datasets/{vendor.lower().replace(' ', '-')}/{name.lower().replace(' ', '-')}#{k+1:02}-Universe-Selection">{name}</a></li>
"""

                    with open(folder / f'{1+k:02} {title.strip()}.html', "w", encoding="utf-8") as html_file:
                        html_file.write(content)

            print(f'Documentation of {dataset["name"]} is generated and inplace!')

        # Remove dataset folders that no longer exist in the dump for this vendor.
        [rmtree(path) for paths in datasets.values() for path in paths if path.is_dir()]
    # Remove vendor folders that no longer exist in the dump.
    [rmtree(path) for paths in current.values() for path in paths if path.is_dir()]

    with open('Resources/datasets/supported-alternative-dataset-universe.html', "w", encoding="utf-8") as html_file:
        html_file.write(universe_html + "</ul>")

0 comments on commit fb27164

Please sign in to comment.