Skip to content

Commit

Permalink
Refactor as not to call update() from add()`
Browse files Browse the repository at this point in the history
A lot of the logic of `update` is needed by `add`:
* codelist file parsing (indirectly)
* file downloading
* manifest handling

This commit pulls out those needed parts into
their own functions, and pulls out
parse_codelist_file's line-parsing logic into its
own function.

This allows us to check the line we're about to
add to the codelists file is valid before doing
so, check that once we add the line to the
codelists file that it is valid, leave
pre-existing codelists and their corresponding
manifest file entries intact.
  • Loading branch information
Jongmassey committed Jan 18, 2025
1 parent e421132 commit 3f29fdb
Showing 1 changed file with 114 additions and 74 deletions.
188 changes: 114 additions & 74 deletions opensafely/codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,57 +80,79 @@ def add(codelist_url, codelists_dir=None):

# trim any anchors from the end
if "#" in line:
line = line[:line.find("#")]
line = line[: line.find("#")]

# turn a download url into a base codelist version url
if line.endswith(".csv"):
line = line[:line.rfind("/")]
line = line[: line.rfind("/")]

# parse the line to make sure it's valid before adding to file
codelist = parse_codelist_file_line(line)

with codelists_file.open("r+") as f:
if not f.readlines()[-1].endswith("\n"):
line = "\n" + line
f.write(line + "\n")
update(codelists_dir, delete_unknown=False)

# parse the codelists file to make sure it's valid
parse_codelist_file(codelists_dir)

# download to the codelists dir
codelist.filename = codelists_dir / codelist.filename
downloaded_on, sha = fetch_codelist(codelist)

write_manifest(codelists_dir, [(codelist, downloaded_on, sha)], True)

def update(codelists_dir=None, delete_unknown=True):

def update(codelists_dir=None):
if not codelists_dir:
codelists_dir = Path.cwd() / CODELISTS_DIR
codelists = parse_codelist_file(codelists_dir)
old_files = set(codelists_dir.glob("*.csv"))
new_files = set()
manifest = {"files": {}}
downloaded_codelists = []
for codelist in codelists:
print(f"Fetching {codelist.id}")
try:
response = requests.get(codelist.download_url)
response.raise_for_status()
except Exception as e:
exit_with_error(
f"Error downloading codelist: {e}\n\n"
f"Check that you can access the codelist at:\n{codelist.url}"
)
codelist.filename.write_bytes(response.content)
downloaded_at, sha = fetch_codelist(codelist)
downloaded_codelists.append((codelist, downloaded_at, sha))
new_files.add(codelist.filename)
write_manifest(codelists_dir, downloaded_codelists, False)
for file in old_files - new_files:
print(f"Deleting {file.name}")
file.unlink()
return True


def write_manifest(codelists_dir, downloaded_codelists, append):
manifest = {"files": {}}
for codelist, downloaded_at, sha in downloaded_codelists:
key = str(codelist.filename.relative_to(codelists_dir))
manifest["files"][key] = {
"id": codelist.id,
"url": codelist.url,
"downloaded_at": f"{datetime.datetime.utcnow()}Z",
"sha": hash_bytes(response.content),
"downloaded_at": f"{downloaded_at}Z",
"sha": sha,
}
manifest_file = codelists_dir / MANIFEST_FILE
preserve_download_dates(manifest, manifest_file)
if append:
old_manifest = get_manifest(codelists_dir)
manifest["files"].update(old_manifest["files"])
else:
preserve_download_dates(manifest, manifest_file)
manifest_file.write_text(json.dumps(manifest, indent=2))
for file in old_files - new_files:
if delete_unknown:
print(f"Deleting {file.name}")
file.unlink()
else:
print(
f"Unknown file in codelists directory {file.name}\nwould be deleted by 'opensafely codelists update'"
)
return True


def fetch_codelist(codelist):
try:
response = requests.get(codelist.download_url)
response.raise_for_status()
except Exception as e:
exit_with_error(
f"Error downloading codelist: {e}\n\n"
f"Check that you can access the codelist at:\n{codelist.url}"
)
codelist.filename.write_bytes(response.content)
return (datetime.datetime.utcnow(), hash_bytes(response.content))


def get_codelists_dir(codelists_dir=None):
Expand Down Expand Up @@ -202,30 +224,7 @@ def check():
return True

codelists = parse_codelist_file(codelists_dir)
manifest_file = codelists_dir / MANIFEST_FILE
if not manifest_file.exists():
# This is here so that switching to use this test in Github Actions
# doesn't cause existing repos which previously passed to start
# failing. It works by creating a temporary manifest file and then
# checking against that. Functionally, this is the same as the old test
# which would check against the OpenCodelists website every time.
if os.environ.get("GITHUB_WORKFLOW"):
print(
"==> WARNING\n"
" Using temporary workaround for Github Actions tests.\n"
" You should run: opensafely codelists update\n"
)
manifest = make_temporary_manifest(codelists_dir)
else:
exit_with_prompt(f"No file found at '{CODELISTS_DIR}/{MANIFEST_FILE}'.")
else:
try:
manifest = json.loads(manifest_file.read_text())
except json.decoder.JSONDecodeError:
exit_with_prompt(
f"'{CODELISTS_DIR}/{MANIFEST_FILE}' is invalid.\n"
"Note that this file is automatically generated and should not be manually edited.\n"
)
manifest = get_manifest(codelists_dir)
all_ids = {codelist.id for codelist in codelists}
ids_in_manifest = {f["id"] for f in manifest["files"].values()}
if all_ids != ids_in_manifest:
Expand Down Expand Up @@ -264,6 +263,35 @@ def check():
return True


def get_manifest(codelists_dir):
manifest_file = codelists_dir / MANIFEST_FILE
if not manifest_file.exists():
# This is here so that switching to use this test in Github Actions
# doesn't cause existing repos which previously passed to start
# failing. It works by creating a temporary manifest file and then
# checking against that. Functionally, this is the same as the old test
# which would check against the OpenCodelists website every time.
if os.environ.get("GITHUB_WORKFLOW"):
print(
"==> WARNING\n"
" Using temporary workaround for Github Actions tests.\n"
" You should run: opensafely codelists update\n"
)
manifest = make_temporary_manifest(codelists_dir)
else:
exit_with_prompt(f"No file found at '{CODELISTS_DIR}/{MANIFEST_FILE}'.")
else:
try:
manifest = json.loads(manifest_file.read_text())
except json.decoder.JSONDecodeError:
exit_with_prompt(
f"'{CODELISTS_DIR}/{MANIFEST_FILE}' is invalid.\n"
"Note that this file is automatically generated and should not be manually edited.\n"
)

return manifest


def make_temporary_manifest(codelists_dir):
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
Expand All @@ -277,6 +305,8 @@ def make_temporary_manifest(codelists_dir):
@dataclasses.dataclass
class Codelist:
id: str # noqa: A003
codelist: str
version: str
url: str
download_url: str
filename: Path
Expand All @@ -291,41 +321,51 @@ def get_codelist_file(codelists_dir):
return codelists_file


def parse_codelist_file_line(line):
line = line.strip().rstrip("/")
if not line or line.startswith("#"):
return
tokens = line.split("/")
if len(tokens) not in [3, 4]:
exit_with_error(
f"{line} does not match [project]/[codelist]/[version] "
"or user/[username]/[codelist]/[version]"
)
line_without_version = "/".join(tokens[:-1])
line_version = tokens[-1]

url = f"{OPENCODELISTS_BASE_URL}/codelist/{line}/"
filename = "-".join(tokens[:-1]) + ".csv"
return Codelist(
id=line,
codelist=line_without_version,
version=line_version,
url=url,
download_url=f"{url}download.csv",
filename=Path(filename),
)


def parse_codelist_file(codelists_dir):
codelists_file = get_codelist_file(codelists_dir)
codelists = []
codelist_versions = {}
for line in codelists_file.read_text().splitlines():
line = line.strip().rstrip("/")
if not line or line.startswith("#"):
codelist = parse_codelist_file_line(line)
if not codelist:
continue
tokens = line.split("/")
if len(tokens) not in [3, 4]:
exit_with_error(
f"{line} does not match [project]/[codelist]/[version] "
"or user/[username]/[codelist]/[version]"
)
line_without_version = "/".join(tokens[:-1])
existing_version = codelist_versions.get(line_without_version)
line_version = tokens[-1]
if existing_version == line_version:
codelist.filename = codelists_dir / codelist.filename
existing_version = codelist_versions.get(codelist.codelist)

if existing_version == codelist.version:
exit_with_error(f"{line} is a duplicate of a previous line")
if existing_version is not None:
exit_with_error(
f"{line} conflicts with a different version of the same codelist: {existing_version}"
)
codelist_versions[line_without_version] = line_version

url = f"{OPENCODELISTS_BASE_URL}/codelist/{line}/"
filename = "-".join(tokens[:-1]) + ".csv"
codelists.append(
Codelist(
id=line,
url=url,
download_url=f"{url}download.csv",
filename=codelists_dir / filename,
)
)
codelist_versions[codelist.codelist] = codelist.version
codelists.append(codelist)

return codelists


Expand Down

0 comments on commit 3f29fdb

Please sign in to comment.