Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data Refresh: CDC Wonder Natality #984

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/us_cdc/natality/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def main(argv):

# Aggregating all count stat vars
df_count = df.loc[df['StatVar'].str.startswith('Count')]
df_count.drop('Unit', axis=1, inplace=True) # Count statvars have no unit.
# df_count.drop('Unit', axis=1, inplace=True) # Count statvars have no unit.
df_count.drop_duplicates(subset=['Year', 'Geo', 'StatVar'],
keep='first',
inplace=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"year_range": "16-20",
"year_range": "16-22",
"data_cols": [
"Births",
"Average Age of Mother (years)",
Expand Down Expand Up @@ -160,6 +160,9 @@
"15": {
"mothersAge": "[15 Years]"
},
"15-": {
"mothersAge": "[15 - Years]"
},
"16": {
"mothersAge": "[16 Years]"
},
Expand Down Expand Up @@ -187,6 +190,9 @@
"24": {
"mothersAge": "[24 Years]"
},
"20-24": {
"mothersAge": "[20 24 Years]"
},
"25": {
"mothersAge": "[25 Years]"
},
Expand All @@ -202,6 +208,9 @@
"29": {
"mothersAge": "[29 Years]"
},
"25-29": {
"mothersAge": "[25 29 Years]"
},
"30": {
"mothersAge": "[30 Years]"
},
Expand All @@ -217,6 +226,9 @@
"34": {
"mothersAge": "[34 Years]"
},
"30-34": {
"mothersAge": "[30 34 Years]"
},
"35": {
"mothersAge": "[35 Years]"
},
Expand All @@ -232,6 +244,9 @@
"39": {
"mothersAge": "[39 Years]"
},
"35-39": {
"mothersAge": "[35 39 Years]"
},
"40": {
"mothersAge": "[40 Years]"
},
Expand All @@ -247,6 +262,9 @@
"44": {
"mothersAge": "[44 Years]"
},
"40-44": {
"mothersAge": "[40 44 Years]"
},
"45": {
"mothersAge": "[45 Years]"
},
Expand All @@ -262,6 +280,9 @@
"49": {
"mothersAge": "[49 Years]"
},
"45-49": {
"mothersAge": "[45 49 Years]"
},
"All": {},
"50+": {
"mothersAge": "[50 - Years]"
Expand All @@ -284,10 +305,10 @@
"mothersEthnicity": "CDC_EthnicityUnknownOrNotStated"
},
"All": {},
"2135-2": {
"2135-...": {
"mothersEthnicity": "HispanicOrLatino"
},
"2186-5": {
"2186-...": {
"mothersEthnicity": "NotHispanicOrLatino"
}
},
Expand Down
6 changes: 5 additions & 1 deletion scripts/us_cdc/natality/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,18 @@ def _process_file(file_name: str, csv_reader: csv.DictReader,
# Add key values based on filename
for file_kv in file_name.split('$$'):
key, val = file_kv.split('=')
# To bring year as header and not a pv
if key == "year":
continue
print(key, val)
if val in _CONFIG['filename'][key]:
update_d.update(_CONFIG['filename'][key][val])
else:
return statvars # Skip processing this file, statvars is empty

# Average age does not make sense when mothersAge is a cprop
if key == 'MAge' and val != 'All':
if _CONFIG['year_range'] == '16-20':
if _CONFIG['year_range'] == '16-22':
data_cols.remove('Average Age of Mother (years)')
elif _CONFIG['year_range'] in ['07-20', '03-06']:
data_cols.remove('Average Age of Mother')
Expand Down
2 changes: 1 addition & 1 deletion scripts/us_cdc/natality/preprocess_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_csv(self):
mcf_path = os.path.join(tmp_dir, 'output.mcf')
preprocess_path = os.path.join(_SCRIPT_PATH, 'preprocess.py')
config_path = os.path.join(_SCRIPT_PATH, 'state',
'16-20_state.json')
'16-22_state.json')
input_path = os.path.join(_SCRIPT_PATH, 'testdata', 'cleaned_data')

subprocess.call([
Expand Down
35 changes: 35 additions & 0 deletions scripts/us_cdc/natality/scrape/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Download process for CDC Wonder Natality

Source url: `https://wonder.cdc.gov/natality-expanded-current.html`

## Requirements
Run the following commands to install the required packages

- `pip3 install -r requirements.txt`

- `pip3 install -r scripts/us_cdc/natality/scrape/requirements.txt` (contains selenium)

## Configuration steps
Certain options must be selected in order to download the required data.

### Web page
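- Open the source url in a browser and use Inspect -> Elements to look up the element `ids` that are referenced in `natality_config.json`.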

### natality_config.json
- `group_by`: Select the `Geo level` and `year` for every file.
- `measure`: Select the required datasets, for example `Births`, `Birth Rate`, `Fertility Rate`, etc., by listing the respective `ids` of the datasets.
- `select`: Select the additional properties to download: age, race, education, marital status, ethnicity, nativity.
- If the property is a radio button, give "`radio`" under that id. Unwanted values of a property (for example `Exc`, `NR`) can be excluded by listing them under "`exclude_options`".
- For each selected property, a "`name`" can be given under its
respective `id` so that the name appears in the file name.

## Executing the download script
- For County data.
```shell
python3 scripts/us_cdc/scrape/scrape_cdc_wonder.py --alsologtostderr --download_path=scripts/us_cdc/natality/county/raw_data/ --website=https://wonder.cdc.gov/natality-expanded-current.html --config_path=scripts/us_cdc/natality/scrape/natality_config.json --parallel_download=0 --headless --calculate_combinations --show_totals --include_zeroes
```

- For State data.
```shell
python3 scripts/us_cdc/scrape/scrape_cdc_wonder.py --alsologtostderr --download_path=scripts/us_cdc/natality/state/raw_data/ --website=https://wonder.cdc.gov/natality-expanded-current.html --config_path=scripts/us_cdc/natality/scrape/natality_config.json --parallel_download=0 --headless --calculate_combinations --show_totals --include_zeroes
```
52 changes: 52 additions & 0 deletions scripts/us_cdc/natality/scrape/natality_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"group_by": {
"SB_1": "D149.V21-level2",
"SB_2": "D149.V20"
},
"measure": [
"CM_002",
"CM_070",
"CM_080",
"CM_090",
"CM_095",
"CM_100",
"CM_110",
"CM_120",
"CM_130"
],
"select": {
"RO_ageD149.V39": {
"radio":"RO_ageD149.V39"
},
"SD149.V39":{
"name":"MAge"
},
"SD149.V5": {
"name": "MEdu",
"exclude_options": ["Exc", "NR"]
},
"SD149.V27": {
"name": "MtalStatus",
"exclude_options": ["NR"]
},
"RO_raceD149.V49":{
"radio":"RO_raceD149.V49"
},
"SD149.V49": {
"name": "MRace",
"exclude_options": ["100"]
},
"SD149.V43": {
"name": "MEth"
},
"SD149.V48":{
"name":"MNat",
"exclude_options":["10"]
}
},
"year":{
"_comment": "The source has its data divided into sets, e.g. 2003-2006. Only the latest data set is downloaded.",
"element": "SD149.V20",
"include_list":["2016","2017","2018","2019","2020","2021","2022"]
}
}
3 changes: 3 additions & 0 deletions scripts/us_cdc/natality/scrape/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Selenium is required to download the raw data from the website

selenium==4.17.2
Loading
Loading