diff --git a/.coverage b/.coverage index 6463e1a..b0f5e6d 100644 Binary files a/.coverage and b/.coverage differ diff --git a/coverage.xml b/coverage.xml index 386fc1c..a517fac 100644 --- a/coverage.xml +++ b/coverage.xml @@ -1,12 +1,12 @@ - + - /Users/jvivian/Library/CloudStorage/GoogleDrive-jtvivian@gmail.com/My Drive/projects/covid19-drDFM/covid19_drdfm + /home/jvivian/covid19-drDFM/covid19_drdfm - + @@ -42,7 +42,7 @@ - + @@ -52,67 +52,69 @@ - - + + - - - - - - - + + + + + + - + + - - - - - - + + + + + - + + - - - + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + + - - - - + + + + - - - - - + + + + - - - + + + + + + @@ -134,48 +136,49 @@ + + - - - - - - - - - + + + + + + + + + - + - + - - - - - - - - - - + + + + + + + + + - - - - + + + + + - diff --git a/covid19_drdfm/constants.py b/covid19_drdfm/constants.py index 8f81179..d25f57a 100644 --- a/covid19_drdfm/constants.py +++ b/covid19_drdfm/constants.py @@ -33,7 +33,7 @@ "Demand_5": "Cons5", "Demand_6": "Employment1", "Demand_7": "Employment2", - "Supply_1": "GDP", + "GDP": "GDP", "Supply_2": "UI", "Supply_3": "PartR", "Supply_4": "UR", diff --git a/covid19_drdfm/dfm.py b/covid19_drdfm/dfm.py index 9fc880e..e4cc73a 100644 --- a/covid19_drdfm/dfm.py +++ b/covid19_drdfm/dfm.py @@ -13,6 +13,7 @@ from statsmodels.tsa.stattools import adfuller from covid19_drdfm.constants import FACTORS +from covid19_drdfm.processing import normalize @dataclass @@ -40,9 +41,12 @@ def state_process(df: pd.DataFrame, state: str) -> pd.DataFrame: df = df[df.State == state] #! The trunctation will be removed when data is updated in OCT - A.C. df = df[:-12] + #! Test double-norm + df = normalize(df).fillna(0) + #! TEST REMOVE const_cols = [x for x in df.columns if is_constant(df[x])] pprint(f"Constant Columns...dropping\n{const_cols}") - df = df.drop(columns=const_cols) + df = df.drop(columns=const_cols).set_index("Time", drop=True) return df diff --git a/covid19_drdfm/processing.py b/covid19_drdfm/processing.py index 8f1a22c..4491225 100644 --- a/covid19_drdfm/processing.py +++ b/covid19_drdfm/processing.py @@ -34,7 +34,6 @@ def get_df() -> pd.DataFrame: dfs = [pd.read_csv(x) for x in paths] return ( reduce(lambda x, y: pd.merge(x, y, on=["State", "Year", "Period"], how="left"), dfs) - .fillna(0) .drop(columns=["Monetary_1_x", "Monetary_11_x"]) .rename(columns={"Monetary_1_y": "Monetary_1", "Monetary_11_y": "Monetary_11"}) .drop( @@ -46,7 +45,6 @@ def get_df() -> pd.DataFrame: .pipe(adjust_pandemic_response) .pipe(diff_vars, cols=DIFF_COLS) .pipe(diff_vars, cols=LOG_DIFF_COLS, log=True) - .fillna(0) .pipe(normalize) .drop(index=0) # Drop first row with NaNs from diff ) @@ -90,7 +88,7 @@ def adjust_inflation(df: pd.DataFrame) -> pd.DataFrame: .assign(Demand_3=lambda x: x.Demand_3.div(x.Monetary_3 / 100)) .assign(Demand_4=lambda x: x.Demand_4.div(x.Monetary_3 / 100)) .assign(Demand_5=lambda x: x.Demand_5.div(x.Monetary_3 / 100)) - .assign(Supply_1=lambda x: x.Supply_1.div(x.Monetary_3 / 100)) + .assign(GDP=lambda x: x.GDP.div(x.Monetary_3 / 100)) .assign(Supply_6=lambda x: x.Supply_6.div(x.Monetary_3 / 100)) ) @@ -171,11 +169,12 @@ def normalize(df: pd.DataFrame) -> pd.DataFrame: Returns: pd.DataFrame: Normalized and stationary DataFrame """ - meta_cols = df[["State", "Time"]] + meta_cols = df[["State", "Time"]].copy().reset_index(drop=True) # df = df.drop(columns=["Time"]) if "Time" in df.columns else df df = df.drop(columns=["State", "Time"]) # Normalize data scaler = MinMaxScaler() new = pd.DataFrame(scaler.fit_transform(df), columns=df.columns) new["State"] = meta_cols["State"] + new["Time"] = meta_cols["Time"] return new diff --git a/covid19_drdfm/streamlit/runner.py b/covid19_drdfm/streamlit/runner.py index b424f04..fa729a2 100644 --- a/covid19_drdfm/streamlit/runner.py +++ b/covid19_drdfm/streamlit/runner.py @@ -10,15 +10,15 @@ from rich import print as pprint from sklearn.preprocessing import MinMaxScaler +from covid19_drdfm.constants import FACTORS from covid19_drdfm.dfm import state_process -from covid19_drdfm.processing import get_df, get_factors -from covid19_drdfm.processing import NAME_MAP +from covid19_drdfm.processing import NAME_MAP, get_df, normalize st.set_page_config(layout="wide") pio.templates.default = "plotly_white" DEFAULTS = { - "Uncat": ["Monetary_5", "Monetary_9", "Monetary_10", "Supply_1", "Supply_7"], + "Uncat": ["Monetary_5", "Monetary_9", "Monetary_10", "GDP", "Supply_7"], "Consumption": ["Demand_3", "Demand_4", "Demand_5"], "Response": [ "Pandemic_Response_1", @@ -32,8 +32,8 @@ "Inflation": ["Monetary_2", "Monetary_3", "Monetary_1"], "Pandemic": ["Pandemic_1", "Pandemic_2", "Pandemic_6", "Pandemic_9", "Pandemic_7", "Pandemic_10"], } -DEFAULTS = {NAME_MAP[x]: [NAME_MAP[z] for z in y] for x, y in DEFAULTS.items() if x in NAME_MAP in NAME_MAP} -print(DEFAULTS) +DEFAULTS = {x: [NAME_MAP[z] for z in y] for x, y in DEFAULTS.items()} +# st.write(DEFAULTS) def center_title(text): @@ -55,10 +55,9 @@ def run_parameterized_model( """ # Factors and input data - factors = get_factors() factor_multiplicities = {"Global": global_multiplier} df = state_process(df, state) - columns = list(columns) + ["State", "Time"] + columns = list(columns) # + ["State", "Time"] columns = [x for x in columns if x in df.columns] new = df[columns] variables = list(factors.keys()) @@ -73,9 +72,9 @@ def run_parameterized_model( # Run Model if (out / "model.csv").exists(): return - model = sm.tsa.DynamicFactorMQ(new, factors=factors, factor_multiplicities=factor_multiplicities) + model = sm.tsa.DynamicFactorMQ(new, factors=FACTORS, factor_multiplicities=factor_multiplicities) try: - results = model.fit(disp=10, maxiter=5_000) + results = model.fit(disp=10, maxiter=10_000) except Exception as e: with open(outdir / "failed.txt", "a") as f: f.write(f"{state}\t{e}\n") @@ -86,7 +85,7 @@ def run_parameterized_model( f.write(results.summary().as_csv()) filtered = results.factors["filtered"] filtered["State"] = state - filtered.to_csv(out / "filtered-factors.csv") + filtered.to_csv(out / "filtered-factors.csv", index=None) return model @@ -97,7 +96,7 @@ def get_data(): df = get_df() sub = pd.Series([x for x in df.columns if x not in ["State", "Time"]], name="Variables").to_frame() -factors = get_factors() +factors = FACTORS.copy() factor_vars = list(factors.keys()) _ = [factors.pop(x) for x in factor_vars if x not in df.columns] sub["Group"] = [factors[x][1] for x in sub.Variables if x in df.columns] diff --git a/tests/test_dfm.py b/tests/test_dfm.py index 9f6a6c9..90a9e21 100644 --- a/tests/test_dfm.py +++ b/tests/test_dfm.py @@ -9,7 +9,8 @@ # TODO: output should go in a directory instead of dumping shit everywhere def test_run_model(): df = get_df() - run_model(df, "NY", Path("./testdir")) - assert Path("./testdir/NY/model.csv").exists() - assert Path("./testdir/NY/results.csv").exists() + state = "SD" + run_model(df, state, Path("./testdir")) + assert Path("./testdir/SD/model.csv").exists() + assert Path("./testdir/SD/results.csv").exists() shutil.rmtree("./testdir")