Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: add string length stats to set stage for upcoming outliers "smart" command to quickly identify outliers using stats/frequency info #2390

Merged
merged 12 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 35 additions & 35 deletions resources/test/boston311-10-boolean-1or0-stats.csv
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2,
case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,0.5,0.1581,0,,0.5,0.25,100,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3887
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,1a4c2204a401f6791b6e5efde990955e1b6c59aec5b3de300686fadb63ee457b
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,,,,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,0,0,0,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,9.3081,86.64,0.8165,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,9.4412,89.1358,0.9938,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,0.4,0.16,0.0645,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,1,1,0.2,,,,,,,,0,,0,2,
case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,,,,0.5,0.1581,0,,0.5,0.25,100,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,34.5543,1194,0.9873,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,14.1156,199.25,0.6007,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,2.6552,7.05,0.113,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,7.9019,62.44,0.4541,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,11.619,135,0.4841,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,10.1272,102.56,0.3723,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,0.4,0.16,0.1053,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,10.4062,108.29,0.3368,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,0.3,0.09,0.1579,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,0.5385,0.29,0.2564,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,3.2696,10.69,0.3593,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,0.4899,0.24,0.3499,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,1.9519,3.81,0.3683,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,0.9,0.81,0.2432,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,2.7889,7.7778,0.2324,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,0,0,0,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,,,,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,,,,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,0.4583,0.21,0.0292,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,3887
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,71b0f8ff9ddfe2ed63633fd0f29bddaadd1613d73b622b54b3be54c6dea56b0d
70 changes: 35 additions & 35 deletions resources/test/boston311-10-boolean-tf-stats.csv
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2,
case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,,,,,,,,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3922
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,dd97ad46b4b34efa66aa634d6c54188eebaf44ef5aaa5dde38180c3435a9ddaa
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,,,,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,0,0,0,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,9.3081,86.64,0.8165,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,9.4412,89.1358,0.9938,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,0.4,0.16,0.0645,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,1,1,0.2,,,,,,,,0,,0,2,
case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,0.5,0.25,0.1111,,,,,,,,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,34.5543,1194,0.9873,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,14.1156,199.25,0.6007,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,2.6552,7.05,0.113,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,7.9019,62.44,0.4541,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,11.619,135,0.4841,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,10.1272,102.56,0.3723,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,0.4,0.16,0.1053,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,10.4062,108.29,0.3368,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,0.3,0.09,0.1579,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,0.5385,0.29,0.2564,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,3.2696,10.69,0.3593,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,0.4899,0.24,0.3499,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,1.9519,3.81,0.3683,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,0.9,0.81,0.2432,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,2.7889,7.7778,0.2324,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,0,0,0,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,,,,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,,,,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,0.4583,0.21,0.0292,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,3922
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,874abe7cd02691b113acc7122097731ef6011f9e8e96dfd63ebbddc6724d19ef
Loading
Loading