datacommonsorg · HarishC727 · Feb 16, 2024 · Feb 26, 2024 · Oct 1, 2024 · Oct 21, 2024
diff --git a/scripts/us_cdc/natality/aggregate.py b/scripts/us_cdc/natality/aggregate.py
@@ -30,7 +30,7 @@ def main(argv):
 
     # Aggregating all count stat vars
     df_count = df.loc[df['StatVar'].str.startswith('Count')]
-    df_count.drop('Unit', axis=1, inplace=True)  # Count statvars have no unit.
+    # df_count.drop('Unit', axis=1, inplace=True)  # Count statvars have no unit.
     df_count.drop_duplicates(subset=['Year', 'Geo', 'StatVar'],
                              keep='first',
                              inplace=True)

diff --git a/.../us_cdc/natality/county/16-20_county.json → .../us_cdc/natality/county/16-22_county.json b/.../us_cdc/natality/county/16-20_county.json → .../us_cdc/natality/county/16-22_county.json
@@ -1,5 +1,5 @@
 {
-    "year_range": "16-20",
+    "year_range": "16-22",
     "data_cols": [
         "Births",
         "Average Age of Mother (years)",
@@ -160,6 +160,9 @@
             "15": {
                 "mothersAge": "[15 Years]"
             },
+            "15-": {
+                "mothersAge": "[15 - Years]"
+            },
             "16": {
                 "mothersAge": "[16 Years]"
             },
@@ -187,6 +190,9 @@
             "24": {
                 "mothersAge": "[24 Years]"
             },
+            "20-24": {
+                "mothersAge": "[20 24 Years]"
+            },
             "25": {
                 "mothersAge": "[25 Years]"
             },
@@ -202,6 +208,9 @@
             "29": {
                 "mothersAge": "[29 Years]"
             },
+            "25-29": {
+                "mothersAge": "[25 29 Years]"
+            },
             "30": {
                 "mothersAge": "[30 Years]"
             },
@@ -217,6 +226,9 @@
             "34": {
                 "mothersAge": "[34 Years]"
             },
+            "30-34": {
+                "mothersAge": "[30 34 Years]"
+            },
             "35": {
                 "mothersAge": "[35 Years]"
             },
@@ -232,6 +244,9 @@
             "39": {
                 "mothersAge": "[39 Years]"
             },
+            "35-39": {
+                "mothersAge": "[35 39 Years]"
+            },
             "40": {
                 "mothersAge": "[40 Years]"
             },
@@ -247,6 +262,9 @@
             "44": {
                 "mothersAge": "[44 Years]"
             },
+            "40-44": {
+                "mothersAge": "[40 44 Years]"
+            },
             "45": {
                 "mothersAge": "[45 Years]"
             },
@@ -262,6 +280,9 @@
             "49": {
                 "mothersAge": "[49 Years]"
             },
+            "45-49": {
+                "mothersAge": "[45 49 Years]"
+            },
             "All": {},
             "50+": {
                 "mothersAge": "[50 - Years]"
@@ -284,10 +305,10 @@
                 "mothersEthnicity": "CDC_EthnicityUnknownOrNotStated"
             },
             "All": {},
-            "2135-2": {
+            "2135-...": {
                 "mothersEthnicity": "HispanicOrLatino"
             },
-            "2186-5": {
+            "2186-...": {
                 "mothersEthnicity": "NotHispanicOrLatino"
             }
         },

diff --git a/scripts/us_cdc/natality/preprocess.py b/scripts/us_cdc/natality/preprocess.py
@@ -97,14 +97,18 @@ def _process_file(file_name: str, csv_reader: csv.DictReader,
     # Add key values based on filename
     for file_kv in file_name.split('$$'):
         key, val = file_kv.split('=')
+        # To bring year as header and not a pv
+        if key == "year":
+            continue
+        print(key, val)
         if val in _CONFIG['filename'][key]:
             update_d.update(_CONFIG['filename'][key][val])
         else:
             return statvars  # Skip processing this file, statvars is empty
 
         # Average age does not make sense when mothersAge is a cprop
         if key == 'MAge' and val != 'All':
-            if _CONFIG['year_range'] == '16-20':
+            if _CONFIG['year_range'] == '16-22':
                 data_cols.remove('Average Age of Mother (years)')
             elif _CONFIG['year_range'] in ['07-20', '03-06']:
                 data_cols.remove('Average Age of Mother')

diff --git a/scripts/us_cdc/natality/preprocess_test.py b/scripts/us_cdc/natality/preprocess_test.py
@@ -29,7 +29,7 @@ def test_csv(self):
             mcf_path = os.path.join(tmp_dir, 'output.mcf')
             preprocess_path = os.path.join(_SCRIPT_PATH, 'preprocess.py')
             config_path = os.path.join(_SCRIPT_PATH, 'state',
-                                       '16-20_state.json')
+                                       '16-22_state.json')
             input_path = os.path.join(_SCRIPT_PATH, 'testdata', 'cleaned_data')
 
             subprocess.call([

diff --git a/scripts/us_cdc/natality/scrape/README.md b/scripts/us_cdc/natality/scrape/README.md
@@ -0,0 +1,35 @@
+# Download process for CDC Wonder Natality
+
+Source url: `https://wonder.cdc.gov/natality-expanded-current.html`
+
+## Requirements
+Run the following commands to install the required packages
+
+- `pip3 install -r requirements.txt`
+
+- `pip3 install -r scripts/us_cdc/scrape/requirements.txt` (contains selenium)
+
+## Configuration steps
+Certain options must be selected in order to download the required data.
+
+### Web page
+- Inspect the page (browser developer tools -> Elements) to find the element `ids` used in `natality_config.json`.
+
+### natality_config.json
+- `group_by`: Select the `Geo level` and `year` for every file.
+- `measure`: Select the required datasets, for example `Births`, `Birth Rate`, `Fertility Rate`, etc., and list the respective `ids` of those datasets.
+- `select`: Start selecting additional properties - age, race, education, marital status, ethnicity, nativity.
+- If the property is a radio button, give `radio` under that id. Unwanted values of a property, such as `Exc` or `NR`, can be excluded by listing them under `exclude_options`.
+- While selecting the properties, a `name` can be given under their respective `ids` so that it appears in the file name.
+
+## Executing the download script
+- For County data.
+```shell
+ python3 scripts/us_cdc/scrape/scrape_cdc_wonder.py --alsologtostderr --download_path=scripts/us_cdc/natality/county/raw_data/   --website=https://wonder.cdc.gov/natality-expanded-current.html   --config_path=scripts/us_cdc/scrape/natality_config.json --parallel_download=0 --headless   --calculate_combinations --show_totals --include_zeroes
+```
+
+- For State data.
+```shell
+ python3 scripts/us_cdc/scrape/scrape_cdc_wonder.py --alsologtostderr --download_path=scripts/us_cdc/natality/state/raw_data/   --website=https://wonder.cdc.gov/natality-expanded-current.html   --config_path=scripts/us_cdc/scrape/natality_config.json --parallel_download=0 --headless   --calculate_combinations --show_totals --include_zeroes
+```
diff --git a/scripts/us_cdc/natality/scrape/natality_config.json b/scripts/us_cdc/natality/scrape/natality_config.json
@@ -0,0 +1,52 @@
+{
+  "group_by": {
+    "SB_1": "D149.V21-level2",
+    "SB_2": "D149.V20"
+  },
+  "measure": [
+    "CM_002",
+    "CM_070",
+    "CM_080",
+    "CM_090",
+    "CM_095",
+    "CM_100",
+    "CM_110",
+    "CM_120",
+    "CM_130"
+  ],
+  "select": {
+    "RO_ageD149.V39": {
+      "radio":"RO_ageD149.V39"
+    },
+    "SD149.V39":{
+      "name":"MAge"
+    },
+    "SD149.V5": {
+      "name": "MEdu",
+      "exclude_options": ["Exc", "NR"]
+    },
+    "SD149.V27": {
+      "name": "MtalStatus",
+      "exclude_options": ["NR"]
+    },
+    "RO_raceD149.V49":{
+      "radio":"RO_raceD149.V49"
+    },
+    "SD149.V49": {
+      "name": "MRace",
+      "exclude_options": ["100"]
+    },
+    "SD149.V43": {
+      "name": "MEth"
+    },
+    "SD149.V48":{
+      "name":"MNat",
+      "exclude_options":["10"]
+    }
+  },
+  "year":{
+    "_comment": "The source has data divided into sets, e.g. 2003-2006. Only the latest year data set is downloaded.",
+    "element": "SD149.V20",
+    "include_list":["2016","2017","2018","2019","2020","2021","2022"]
+  }
+}
diff --git a/scripts/us_cdc/natality/scrape/requirements.txt b/scripts/us_cdc/natality/scrape/requirements.txt
@@ -0,0 +1,3 @@
+# Installing selenium to download the raw data from website
+
+selenium==4.17.2
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Installing selenium to download the raw data from website

		selenium==4.17.2