Skip to content

Commit

Permalink
Fixes Alternative Data Code Generator
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexCatarino committed Nov 30, 2024
1 parent f1f9685 commit fb27164
Showing 1 changed file with 106 additions and 103 deletions.
209 changes: 106 additions & 103 deletions code-generators/Alternative-Datasets-Code-Generator.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
import os
import pathlib
import shutil
from urllib.request import urlopen
from pathlib import Path
from itertools import groupby
from shutil import move, rmtree
from bs4 import BeautifulSoup

def metadata_content(vendor, dataset):
from _code_generation_helpers import get_json_content

DATASET = "03 Writing Algorithms/14 Datasets"

def _directory_content(path):
def append_path(v):
return [Path(f'{path}/{x}') for x in v]
key = lambda x: x[3:]
grouped = groupby(sorted(os.listdir(path), key=key), key)
return {k: append_path(v) for k,v in grouped}

def _write_metadata_file(folder, vendor, dataset):
vendor_ = vendor.lower().replace(" ", "-")
dataset_ = dataset.lower().replace(" ", "-")
return f'''{{
with open(folder / f'metadata.json', "w", encoding="utf-8") as json_file:
json_file.write(f'''{{
"type": "metadata",
"values": {{
"description": "{dataset} dataset from {vendor}.",
Expand All @@ -18,106 +29,98 @@ def metadata_content(vendor, dataset):
"og:site_name": "{dataset} - QuantConnect.com",
"og:image": "https://cdn.quantconnect.com/docs/i/writing-algorithms/datasets/{vendor_}/{dataset_}.png"
}}
}}'''

for clean_up in os.listdir('03 Writing Algorithms/14 Datasets'):
if not '01 Overview' in clean_up and not "readme" in clean_up:
destination = '03 Writing Algorithms/14 Datasets/' + clean_up
temp = 'tmp/03 Writing Algorithms/14 Datasets/' + clean_up
if os.path.isdir(destination):
shutil.copytree(destination, temp, dirs_exist_ok=True,
ignore=lambda dir, files: [f for f in files if os.path.isfile(os.path.join(dir, f)) and str(f) != "metadata.json" and str(f) != "00.json"])
shutil.rmtree(destination)
shutil.copytree(temp, destination, dirs_exist_ok=True)
shutil.rmtree(temp)

url = urlopen("https://s3.amazonaws.com/cdn.quantconnect.com/web/docs/alternative-data-dump-v2024-01-02.json")
response = url.read().decode("utf-8") \
.replace("true", "True") \
.replace("false", "False") \
.replace("null", "None")
doc = eval(response)

languages = {"language-python": "python", "language-cs": "csharp"}
vendor_count = 2
vendors = {}
product_count = {}
attr = False

priority = ["QuantConnect", "AlgoSeek", "Morningstar", "TickData", "CoinAPI", "OANDA"]
vendor_names = priority + sorted([x for x in [dataset["vendorName"].strip() for dataset in doc] if x not in priority])
for vendor in vendor_names:
if vendor not in vendors:
vendors[vendor] = vendor_count
product_count[vendor] = {dataset["name"].strip(): m+1 for m, dataset in enumerate(sorted([x for x in doc if x["vendorName"].strip() == vendor], key=lambda x: x['name']))}
vendor_count += 1

universe_html = """<p>The following alternative datasets support universe selection:</p>
}}''')

if __name__ == '__main__':

    # Fetch the alternative-dataset dump and sort it so groupby (which only
    # groups consecutive equal keys) sees each vendor's datasets together.
    url = "https://s3.amazonaws.com/cdn.quantconnect.com/web/docs/alternative-data-dump-v2024-01-02.json"
    docs = sorted(get_json_content(url), key=lambda x: x["vendorName"].strip())

    # vendor name -> datasets for that vendor, sorted by dataset name.
    docs_by_vendor = {k: sorted(v, key=lambda x: x['name'].strip())
                      for k, v in groupby(docs, lambda x: x["vendorName"].strip())}

    # Priority vendors keep their fixed leading positions; the rest follow
    # in the (already sorted) order they appear in docs_by_vendor.
    priority = ["QuantConnect", "AlgoSeek", "Morningstar", "TickData", "CoinAPI", "OANDA"]
    vendors = [x for x in priority if x in docs_by_vendor]
    vendors += [k for k in docs_by_vendor if k not in vendors]

    # Existing on-disk vendor folders keyed by prefix-stripped name; the
    # '01 Overview' folder is hand-written and must not be touched.
    current = _directory_content(DATASET)
    current.pop('Overview')

    languages = {"language-python": "python", "language-cs": "csharp"}

    universe_html = """<p>The following alternative datasets support universe selection:</p>
<ul>
"""

    for i, vendor in enumerate(vendors):
        # Vendor folders start at prefix 02 ('01 Overview' is reserved).
        vendor_folder = Path(f'{DATASET}/{i+2:02} {vendor}')
        vendor_folder.mkdir(parents=True, exist_ok=True)
        # Move any pre-existing folder for this vendor into its new slot.
        for path in current.pop(vendor, []):
            move(path, vendor_folder)
        datasets = _directory_content(vendor_folder)
        for f, dataset in enumerate(docs_by_vendor.pop(vendor)):
            name = dataset['name'].strip()
            folder = Path(f'{vendor_folder}/{f+1:02} {name}')
            folder.mkdir(parents=True, exist_ok=True)
            # Re-home any pre-existing dataset folder under the new numbering.
            for path in datasets.pop(name, []):
                move(path, folder)

            # Generated HTML is rebuilt from scratch on every run.
            [os.remove(x) for x in folder.iterdir() if x.suffix == '.html']

            _write_metadata_file(folder, vendor, name)

            # Merge the 'about' and 'documentation' sections; untitled items are dropped.
            all_sections = {**{item["title"]: item["content"] for item in dataset["about"] if item["title"]},
                            **{item["title"]: item["content"] for item in dataset["documentation"] if item["title"]}}

            for k, (title, content) in enumerate(all_sections.items()):
                # Normalize escaped slashes/arrows, make QC links site-relative,
                # and expand the bare embed frame into the styled python frame.
                content = content.replace("\/", "/") \
                    .replace("->", "-&gt;") \
                    .replace("=>", "=&gt;") \
                    .replace("=<", "=&lt;") \
                    .replace("https://www.quantconnect.com/docs/v2/", "/docs/v2/") \
                    .replace("https://www.quantconnect.com/datasets/", "/datasets/") \
                    .replace('<div class="qc-embed-frame"><div class="qc-embed-dummy"></div><div class="qc-embed-element"><iframe class="qc-embed-backtest"',
                             '<div class="qc-embed-frame python" style="display: inline-block; position: relative; width: 100%; min-height: 100px; min-width: 300px;"><div class="qc-embed-dummy" style="padding-top: 56.25%;"></div><div class="qc-embed-element" style="position: absolute; top: 0; bottom: 0; left: 0; right: 0;"><iframe class="qc-embed-backtest"')
                soup = BeautifulSoup(content, 'html.parser')
                # Convert <code class="language-*"> snippets inside example
                # containers into per-language <pre> blocks.
                for code_section in soup.find_all("div", class_="section-example-container"):
                    for pre_code_section in soup.find_all("pre"):
                        for old, new in languages.items():
                            for code_snippet in pre_code_section.find_all('code', {'class' : old}):
                                converted = f'{code_snippet}'.replace('code', 'pre').replace(old, new)
                                content = content.replace(f'{pre_code_section}', converted)

                if title.lower() == "example applications":
                    # Rebuild the backtest embeds with the styled iframe markup;
                    # the slice pulls the backtest id out of the embed's source.
                    start = content.find('<div class="dataset-embeds">')
                    if start > 0:
                        text = ''
                        for old, new in languages.items():
                            for code_section in soup.find_all("div", class_=f"qc-embed-frame {old}"):
                                text += f"\n<div class='{new}'><div class='qc-embed-frame' style='display: inline-block; position: relative; width: 100%; min-height: 100px; min-width: 300px;'><div class='qc-embed-dummy' style='padding-top: 56.25%;'></div><div class='qc-embed-element' style='position: absolute; top: 0; bottom: 0; left: 0; right: 0;'><iframe class='qc-embed-backtest' src='https://www.quantconnect.com/terminal/processCache?request=embedded_backtest{str(code_section)[-61:-28]}.html' style='max-width: calc(100vw - 30px); max-height: 100vw; overflow: hidden;' scrolling='no' width='100%' height='100%'></iframe></div></div></div>"
                        end = start + len(str(soup.find_all("div", class_="dataset-embeds")))
                        content = content.replace(content[start:end], text)

                    # Example applications always land in a fixed '99 ...' file.
                    with open(folder / f'99 {title.strip()}.html', "w", encoding="utf-8") as html_file:
                        html_file.write(content)
                else:
                    for old, new in languages.items():
                        content = content.replace(old, new)
                    if title.lower() == "introduction":
                        # f-string expressions can't contain backslashes pre-3.12,
                        # hence the named variable.
                        backslash = '\\'
                        content += f"""
<p>For more information about the {name} dataset, including CLI commands and pricing, see the <a href=\"{dataset['url'].lower().replace(backslash, '')}\">dataset listing</a>.<p>"""

                    # Priority vendors are excluded from the alternative-dataset
                    # universe list (they are market data, not alternative data).
                    if title.strip() == "Universe Selection" and vendor not in priority:
                        universe_html += f""" <li><a href="/docs/v2/writing-algorithms/datasets/{vendor.lower().replace(' ', '-')}/{name.lower().replace(' ', '-')}#{k+1:02}-Universe-Selection">{name}</a></li>
"""

                    with open(folder / f'{1+k:02} {title.strip()}.html', "w", encoding="utf-8") as html_file:
                        html_file.write(content)

            print(f'Documentation of {dataset["name"]} is generated and inplace!')

        # Remove dataset folders that no longer exist in the dump for this vendor.
        [rmtree(path) for paths in datasets.values() for path in paths if path.is_dir()]
    # Remove vendor folders that no longer exist in the dump.
    [rmtree(path) for paths in current.values() for path in paths if path.is_dir()]

    with open('Resources/datasets/supported-alternative-dataset-universe.html', "w", encoding="utf-8") as html_file:
        html_file.write(universe_html + "</ul>")

0 comments on commit fb27164

Please sign in to comment.