Fix compatibility with deployed static sites #26

Merged
3 changes: 2 additions & 1 deletion .gitignore
@@ -148,4 +148,5 @@
 2017/
 2018/
 2019/
-2020/
+2020/
+.vscode/
3 changes: 3 additions & 0 deletions Pipfile
@@ -13,3 +13,6 @@ beautifulsoup4 = "~=4.10.0"
 
 [requires]
 python_version = "3.9"
+
+[scripts]
+serve = "python -m http.server --cgi 5000"
8 changes: 8 additions & 0 deletions README.md
@@ -27,3 +27,11 @@ In order to better preserve things, we crawl each year's PyCon TW website into s
 ```
 
 and then access `localhost:[PORT]`.
+
+or run the command:
+
+```bash
+pipenv run serve
+```
+
+and access `localhost:5000`.
29 changes: 20 additions & 9 deletions main.py
@@ -10,10 +10,12 @@
 from loguru import logger
 
 PYCON_YEAR = "2016"
-PYCON_URL = "https://tw.pycon.org"
+PYCON_HOST = "tw.pycon.org"
+PYCON_URL = f"https://{PYCON_HOST}"
 
 
 def mkdir(path):
+    path = urlparse(path).path
     try:
         # 1) correct the path to directory path and be a local path
         # 2) by using unquote to avoid the Garbled path
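
The core of the fix is this normalization step: `urlparse(...).path` reduces an absolute URL and a bare path to the same local path, so links written as `https://tw.pycon.org/...` no longer slip past the crawler's path handling. A minimal standalone sketch of that behavior (the sample URLs are illustrative):

```python
from urllib.parse import urlparse

# Both forms collapse to the same local path, so the crawler can treat
# absolute links and relative links identically from here on.
for href in ("https://tw.pycon.org/2016/events/", "/2016/events/"):
    print(urlparse(href).path)  # -> /2016/events/ in both cases
```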
@@ -27,6 +29,7 @@ def mkdir(path):
 
 def writefile(path):
     # request to the Pycon path, and fetch it to local file by using binary writing
+    path = urlparse(path).path
     request = requests.get(PYCON_URL + path)
     file = "." + unquote(path)
     try:
@@ -37,6 +40,7 @@ def writefile(path):
 
 
 def getcssimg(path):
+    path = urlparse(path).path
     # get all url like /year/... target, and try to save them all.
     file = "." + path
     with open(file, "rb") as f:
@@ -118,8 +122,8 @@ def get_assets(path: Path):
 
 
 def get_page(path):
-    # filter our target path
-    if path[0] != "/" or Path("." + path + "index.html").exists():
+    path = urlparse(path).path
+    if Path("." + path + "index.html").exists():  # Don't crawl same page again in case of infinite loop
        return
     logger.info("fetching " + PYCON_URL + path)
     request = requests.get(PYCON_URL + path)
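
With the path normalized first, the existing `index.html` check becomes a reliable stop condition for pages that link back to each other. A small sketch of the guard as a standalone helper (the helper name is ours, not the PR's):

```python
from pathlib import Path
from urllib.parse import urlparse

def already_saved(href: str) -> bool:
    # Each crawled page is written to ./<path>/index.html, so an existing
    # file means this page was fetched before -- returning early on it is
    # what breaks cycles between mutually linked pages.
    path = urlparse(href).path
    return Path("." + path + "index.html").exists()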
@@ -147,9 +151,10 @@ def get_page(path):
     html = str(soup)
     html = html.replace('method="post"', "")
     html = html.replace('action="/' + PYCON_YEAR + '/set-language/"', "")
-    html = html.replace(
-        f"/{PYCON_YEAR}/", f"{BASE_URL}/{PYCON_YEAR}/"
-    )  # Replace base url since the gh-pages use base url following `{host}/{repo}/` instead of {host}/
+    if path.startswith(f"/{PYCON_YEAR}"):
+        html = html.replace(
+            f"/{PYCON_YEAR}/", f"{BASE_URL}/{PYCON_YEAR}/"
+        )  # Replace base url since the gh-pages use base url following `{host}/{repo}/` instead of {host}/
     full_path = BASE_URL + path
     if PYCON_YEAR == "2016":
         html = html.replace(
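
To see why the rewrite is now guarded: GitHub Pages serves a project site under `{host}/{repo}/`, so year-relative links need a `BASE_URL` prefix, but only for pages that actually live under the year's tree. A self-contained sketch of the guarded rewrite (the `BASE_URL` value is hypothetical):

```python
PYCON_YEAR = "2016"
BASE_URL = "/my-repo"  # hypothetical; on gh-pages this would be /{repo}

def rewrite_links(html: str, path: str) -> str:
    # Only pages under /2016/ get their internal links prefixed;
    # everything else is left untouched.
    if path.startswith(f"/{PYCON_YEAR}"):
        html = html.replace(f"/{PYCON_YEAR}/", f"{BASE_URL}/{PYCON_YEAR}/")
    return html

print(rewrite_links('<a href="/2016/events/">', "/2016/index.html"))
# -> <a href="/my-repo/2016/events/">
```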
@@ -285,14 +290,20 @@ def main():
 
     for crawler_url in crawler_urls:
         url = urlparse(crawler_url)
+        # Checking if the url is a pycon website
+        if url.netloc != PYCON_HOST and url.netloc != "":
+            continue
+        # Checking if the path is right or not
+        if url.netloc == "" and url.path.find(f"/{PYCON_YEAR}") != 0:
+            continue
         path_parts = Path(url.path).parts
         if len(path_parts) >= 2 and path_parts[1] == PYCON_YEAR:
-            get_page(crawler_url)
-            get_page(crawler_url.replace("zh-hant", "en-us"))
+            get_page(url.path)
+            get_page(url.path.replace("zh-hant", "en-us"))
 
     for link in soup.findAll("link", {"rel": "icon"}):
         if "href" in link.attrs:
-            get_assets(Path(link["href"]))
+            get_assets(Path(urlparse(link["href"]).path))
 
 
 @click.command()
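
The new checks in `main()` amount to a small predicate: follow a link only if it points at the PyCon host or is host-relative, and a host-relative link must live under the year's tree. A standalone restatement of that filter (the function name is ours), with a few illustrative cases:

```python
from urllib.parse import urlparse

PYCON_HOST = "tw.pycon.org"
PYCON_YEAR = "2016"

def should_crawl(href: str) -> bool:
    """Mirror the filter added in main(): accept only same-host or
    relative links, and relative links must start with /{PYCON_YEAR}."""
    url = urlparse(href)
    if url.netloc != PYCON_HOST and url.netloc != "":
        return False  # link to an external site
    if url.netloc == "" and url.path.find(f"/{PYCON_YEAR}") != 0:
        return False  # relative link outside the year's tree
    return True

assert should_crawl("https://tw.pycon.org/2016/events/")
assert should_crawl("/2016/speakers/")
assert not should_crawl("https://example.com/2016/")
assert not should_crawl("/about/")
```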