diff --git a/isp_eda.qmd b/isp_eda.qmd
index 9834792..9ac8fdc 100644
--- a/isp_eda.qmd
+++ b/isp_eda.qmd
@@ -76,8 +76,8 @@ isp <- read.csv("data/isp.csv",
                                has_satel = "logical",
                                array_agg = "character"))
 
-isp[["cnt_locations"]] <- isp[["cnt_locations"]]
-isp[["cnt_locations_services"]] <- isp[["cnt_locations_services"]]
+isp[["cnt_locations"]] <- as.numeric(isp[["cnt_locations"]])
+isp[["cnt_locations_services"]] <- as.numeric(isp[["cnt_locations_services"]])
 
 table_with_options(isp)
 ```
@@ -95,7 +95,7 @@ sprintf("Number of unique brand name pre cleaning: %s", length(unique(isp[["bran
 
 if we remove all capitalization:
 ```{r}
-sprintf("Number of unique brand name pre cleaning: %s", length(unique(tolower(isp[["brand_name"]]))))
+sprintf("Number of unique brand name w/o caps: %s", length(unique(tolower(isp[["brand_name"]]))))
 ```
 
 We can filter, on this release, 21 cases for brand name.
@@ -106,5 +106,42 @@ We can filter, on this release, 21 cases for brand name.
 - provider_id can be wrong
 - brand name can be different
 
+One case:
+0003738655 130432 "EATEL Corp." 83537 86548 true true true false false {LA}
+0009873712 131103 "EATEL Corp." 34494 34497 false true true false false {LA}
+Other case:
+
+0002626984 130008 Acentek 47 47 true false true false false {MN}
+0002626984 130008 ACENTEK 1395 1395 true false false false false {MN}
+0002645927 130008 Acentek 19521 26636 true false true true false {IA,MN}
+
+
+### Rules for problems:
+
+- less than 2 locations
+
+```{r}
+#| label: less-than-2-locations
+# Every row starts as "Good"; the rule below overwrites the flag for the
+# rows it catches.
+isp[["quality"]] <- "Good"
+# which() drops NA comparisons: the as.numeric() coercion of cnt_locations
+# can yield NA, and `[<-.data.frame` errors on NA in a logical row
+# subscript. Rows with an NA count keep their current flag.
+few_locations <- which(isp[["cnt_locations"]] < 2)
+isp[few_locations, "quality"] <- "few locations"
+```
+
+- More than one frn for a provider_id
+
+```{r}
+#| label: more-than-one-frn-for-a-provider
+# TODO: rule not implemented yet -- flag provider_id values that appear
+# with more than one frn.
+```
+
+
+- same frn but more than one provider_id
+
+- frn and/or provider_id with more than one row
 
 