From b5d465aca2b1ca5f9fb030994eb210fe09e1c22d Mon Sep 17 00:00:00 2001 From: Carolyn Au Date: Mon, 12 Oct 2020 10:14:54 -0700 Subject: [PATCH 1/2] Small text updates --- Gemfile.lock | 50 ++++++++++++++------------------- contributing/adding_datasets.md | 2 +- contributing/index.md | 6 ++-- data_model.md | 25 +++++++---------- index.md | 41 ++++++--------------------- 5 files changed, 44 insertions(+), 80 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 5f0d201fe..d870b6f44 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.3.2) + activesupport (6.0.3.4) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 0.7, < 2) minitest (~> 5.1) @@ -19,7 +19,7 @@ GEM concurrent-ruby (1.1.7) dnsruby (1.61.4) simpleidn (~> 0.1) - em-websocket (0.5.1) + em-websocket (0.5.2) eventmachine (>= 0.12.9) http_parser.rb (~> 0.6.0) ethon (0.12.0) @@ -29,27 +29,25 @@ GEM faraday (1.0.1) multipart-post (>= 1.2, < 3) ffi (1.13.1) - font-awesome-sass (5.13.0) - sassc (>= 1.11) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (207) + github-pages (209) github-pages-health-check (= 1.16.1) jekyll (= 3.9.0) jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) jekyll-commonmark-ghpages (= 0.1.6) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.13.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) jekyll-github-metadata (= 2.13.0) - jekyll-mentions (= 1.5.1) + jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.15.0) + jekyll-redirect-from (= 0.16.0) jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.1) + jekyll-remote-theme (= 0.4.2) jekyll-sass-converter (= 1.5.2) jekyll-seo-tag (= 2.6.1) jekyll-sitemap (= 1.4.0) @@ -57,7 +55,7 @@ GEM jekyll-theme-architect (= 0.1.1) jekyll-theme-cayman (= 0.1.1) jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) jekyll-theme-leap-day (= 0.1.1) jekyll-theme-merlot (= 0.1.1) jekyll-theme-midnight (= 0.1.1) @@ -68,14 +66,14 @@ GEM jekyll-theme-tactile (= 0.1.1) jekyll-theme-time-machine (= 0.1.1) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.11.1) + jemoji (= 0.12.0) kramdown (= 2.3.0) kramdown-parser-gfm (= 1.1.0) liquid (= 4.0.3) mercenary (~> 0.3) minima (= 2.5.1) nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.19.0) + rouge (= 3.23.0) terminal-table (~> 1.4) github-pages-health-check (1.16.1) addressable (~> 2.3) @@ -116,17 +114,14 @@ GEM rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.13.0) + jekyll-feed (0.15.1) jekyll (>= 3.7, < 5.0) - jekyll-font-awesome-sass (0.1.1) - font-awesome-sass (>= 4) - jekyll (>= 2.5, < 4.0) jekyll-gist (1.5.0) octokit (~> 4.2) jekyll-github-metadata (2.13.0) jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.5.1) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) jekyll (>= 3.7, < 5.0) jekyll-optional-front-matter (0.3.2) @@ -134,14 +129,15 @@ GEM jekyll-paginate (1.1.0) jekyll-readme-index (0.3.0) jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.15.0) + jekyll-redirect-from (0.16.0) jekyll (>= 3.3, < 5.0) jekyll-relative-links (0.6.1) jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.1) + jekyll-remote-theme (0.4.2) addressable (~> 2.0) jekyll (>= 3.5, < 5.0) - rubyzip (>= 1.3.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) jekyll-seo-tag (2.6.1) @@ -158,8 +154,8 @@ GEM jekyll-theme-dinky 
(0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) jekyll-theme-leap-day (0.1.1) jekyll (~> 3.5) @@ -193,7 +189,7 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.11.1) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) @@ -211,7 +207,7 @@ GEM jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.14.1) + minitest (5.14.2) multipart-post (2.1.1) nokogiri (1.10.10) mini_portile2 (~> 2.4.0) @@ -225,7 +221,7 @@ GEM rb-inotify (0.10.1) ffi (~> 1.0) rexml (3.2.4) - rouge (3.19.0) + rouge (3.23.0) ruby-enum (0.8.0) i18n rubyzip (2.3.0) @@ -235,8 +231,6 @@ GEM sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sassc (2.4.0) - ffi (~> 1.9) sawyer (0.8.2) addressable (>= 2.3.5) faraday (> 0.8, < 2.0) @@ -261,9 +255,7 @@ PLATFORMS DEPENDENCIES github-pages jekyll-feed (~> 0.6) - jekyll-font-awesome-sass jekyll-redirect-from - jemoji BUNDLED WITH 2.1.4 diff --git a/contributing/adding_datasets.md b/contributing/adding_datasets.md index 19cffb3f6..fb3ba307a 100644 --- a/contributing/adding_datasets.md +++ b/contributing/adding_datasets.md @@ -15,7 +15,7 @@ If you are seeking to contribute highly structured and clean data to the Data Co ### Cleaning the CSV -Sometimes the CSV needs some processing before it can be imported. There are no restrictions on your approach for this step, only requirements for its result. +Sometimes the CSV needs processing before it can be imported. There are no restrictions on your approach for this step, only requirements for its result. 1. Each [`StatisticalVariable`](https://datacommons.org/browser/StatisticalVariable) must have its own column for its observed value. 1. Each property in your schema must have its own column for its value, including the values of [`observationAbout`](https://datacommons.org/browser/observationAbout) and [`observationDate`](https://datacommons.org/browser/observationDate). ([`observationPeriod`](https://datacommons.org/browser/observationPeriod) is also helpful) diff --git a/contributing/index.md b/contributing/index.md index f75253558..107096d65 100644 --- a/contributing/index.md +++ b/contributing/index.md @@ -6,10 +6,10 @@ has_children: true --- # Contribute to Data Commons! - Data Commons has benefited greatly from our collaborations with different government organizations and academic institutions and are looking to expand the set of collaborative projects. In particular, we are looking for partner for: +Data Commons has benefited greatly from our collaborations with different government organizations and academic institutions, and we are looking to expand the set of collaborative projects. In particular, we are looking for partners for: - [Create tools](#creating-a-new-tool): Build new tools or applications that bring the data in data commons to new categories of users. - [Create new Curriculum](#sharing-analysis): Using Data Commons in data science and machine learning courses. +- [Create tools](#creating-a-new-tool): Build new tools or applications that bring the data in Data Commons to new categories of users. +- [Create new curriculum](#sharing-analysis): Use Data Commons in data science and machine learning courses.
- [Write documentation](#updating-documentation) diff --git a/data_model.md b/data_model.md index 46adc8143..b5449c575 100644 --- a/data_model.md +++ b/data_model.md @@ -1,30 +1,25 @@ --- layout: default -title: Data Model -nav_order: 2 ---- ---- -layout: default title: Data Models nav_order: 2 --- -##Data models - The data included in data commons, even today, covers a wider range of domains, ranging from time series about demographics and employment to hurricanes to election results to protein structures. There is an inherent tension between using domain specific data models versus a more expressive but likely verbose data model capable of covering the breadth of domains we hope to have in data commons. +# Data models +The data included in Data Commons, even today, covers a wide range of domains, from time series about demographics and employment, to hurricanes, to election results, to protein structures. There is an inherent tension between using domain-specific data models versus a more expressive, but likely verbose, data model capable of covering the breadth of domains we hope to have in Data Commons. - It is important for there to have an underlying model capable of expressing the breadth of data we might have about a single entity. For example, consider a place (such as Cook County, IL). There are time series about this place (related to demographics, jobs, etc.), there are specific events (like winter storms), information about historic events, etc. We would like a single uniform schema and query API for a client to access all this data. At the same time, for many applications that access a narrower slice of the data, it would be convenient to use data models that enable more compact encodings and/or more standard query languages such as SQL. To accomplish this, we use an expressive though verbose ‘base’ representation layer into which everything can be mapped. And on top of this we layer APIs which provide alternate views of the data in more specific data models. +It is important to have an underlying model capable of expressing the breadth of data we might have about a single entity. For example, consider a place such as [Cook County, IL](http://datacommons.org/place/geoId/17031). There are time series about this place (related to demographics, jobs, etc.), there are specific events (like winter storms), information about historic events, etc. We would like a single uniform schema and query API for a client to access all this data. At the same time, for many applications that access a narrower slice of the data, it would be convenient to use data models that enable more compact encodings and/or more standard query languages such as SQL. To accomplish this, we use an expressive though verbose 'base' representation layer into which everything can be mapped. And on top of this we layer APIs which provide alternate views of the data in more specific data models. Data Commons provides access to the data through the following views: -The data model for the base layer is the one used by schema.org . This models the world as a set of entities, with attributes and relationships between entities. There is a taxonomy of entities and each entity is an instance of at least one of the types in the taxonomy. The types and relations types are also entities. This kind of structure has its origins in knowledge representation systems such as KRL and Cyc and has recently found adoption under the name “knowledge graph”. The node api and sparql apis provide access to this view.
The KG browser (raw graph view) allows one to browse through data commons in this view Time series view provides a set of time series for combinations of entities and variables (StatVars, in data commons parlance). The DCGet api provides API access to this view of the data and the timeline tool allows one to browse data commons in this view The relational view provides access to a subset of the data commons data as a set of relational tables in Big Query (coming soon). This makes it easier for users to combine their data with data from Data Commons. +1. The data model for the base layer is the one used by [Schema.org](https://schema.org). This models the world as a set of entities, with attributes and relationships between entities. There is a taxonomy of entities and each entity is an instance of at least one of the types in the taxonomy. The types and relation types are also entities. This kind of structure has its origins in knowledge representation systems such as KRL and Cyc, and has recently found adoption under the name 'knowledge graph'. The [Node and SPARQL APIs](/api) provide access to this view. The [Data Commons Graph Browser](https://datacommons.org/browser) allows one to browse through Data Commons in this raw graph view. +1. The time series view provides a set of time series for combinations of entities and variables ([Statistical Variables](https://datacommons.org/browser/StatisticalVariable), in Data Commons parlance). The [DCGet API](/api/sheets/get_variable.html) provides API access to this view of the data, and the [Data Commons Timelines tool](https://datacommons.org/tools/timelines) allows one to browse Data Commons in this view. +1. The relational view provides access to a subset of the Data Commons data as a set of relational tables in BigQuery (coming soon). This makes it easier for users to combine their data with data from Data Commons. -##Schemas - A single schema (to the extent possible) for all the data is one of data common’s main goals. We would like this schema to be ‘web friendly’ in the sense that it is an extension of some of the most widely used schemas on the web for structured data. To this end, Data Commons is built on top of Schema.org. We make heavy use of some of Schema.org term (notably StatisticalPopulation and Observation) and extend Schema.org as required, introducing both general constructs (such as Intervals) and values for common attribute values (e.g., Ethnicities, EducationalAttainments, etc.). +## Schemas +A single schema (to the extent possible) for all the data is one of Data Commons' main goals. We would like this schema to be 'web-friendly' in the sense that it is an extension of some of the most widely used schemas on the web for structured data. To this end, Data Commons is built on top of [Schema.org](https://schema.org). We make heavy use of some of Schema.org's terms (notably [StatisticalPopulation](https://schema.org/StatisticalPopulation) and [Observation](https://schema.org/Observation)) and extend Schema.org as required, introducing both general constructs (such as Intervals) and values for common attribute values (e.g., [Ethnicities](http://browser.datacommons.org/browser/race), [EducationalAttainments](http://browser.datacommons.org/browser/educationalAttainment), etc.). -##CrossWalks - A significant part of the work in building Data Commons is in aligning terms used to refer to the same or overlapping concepts across different datasets. Certain kinds of terms have widely shared meaning, e.g., age, life expectancy.
Others, such as educational attainment are measured differently across different regions. Sometimes, different data sets about the same topic will use slightly different definitions of a term (e.g., BLS vs Census on the definition of what it means to be employed) and in some cases, the same dataset might even change its definition over time. Even in these cases, for many applications that aim to perform comparisons, it is useful to have mappings or aggregations between these different terminologies. +## Crosswalks +A significant part of the work in building Data Commons is in aligning terms used to refer to the same or overlapping concepts across different datasets. Certain kinds of terms have widely shared meaning, e.g., [age](http://browser.datacommons.org/browser/age), [life expectancy](http://browser.datacommons.org/browser/lifeExpectancy). Others, such as [educational attainment](http://browser.datacommons.org/browser/educationalAttainment), are measured differently across different regions. Sometimes, different datasets about the same topic will use slightly different definitions of a term (e.g., [BLS](https://www.bls.gov/bls/employment.htm) vs. [Census](https://www.census.gov/topics/employment.html) on the definition of what it means to be employed), and in some cases, the same dataset might even change its definition over time. Even in these cases, for many applications that aim to perform comparisons, it is useful to have mappings or aggregations between these different terminologies. In Data Commons, to the extent possible, we preserve the original encodings. We also introduce new derived attributes/time series that capture mappings. We hope that this will enable useful applications for end users, while preserving the ability for researchers to explore implications of alternate mappings. diff --git a/index.md b/index.md index 0ee049306..7db4f8c52 100644 --- a/index.md +++ b/index.md @@ -1,46 +1,23 @@ --- layout: default -title: About Data Commons +title: Why Data Commons nav_order: 1 --- -#Why Data Commons? - Data underlies everything, from science and public policy to journalism, but often, the data does not get used as much as it should be. The problem often (not always) is not the lack of data. There is a substantial amount of data that is publicly available, most notably from organizations such as the World Bank, Census Bureaus of different countries, CDC, etc. The problem is the difficulty in using the data. +# Why Data Commons? +Data underlies everything, from science and public policy to journalism, but often the data does not get used as much as it should. The problem is often (though not always) not a lack of data. There is a substantial amount of data that is publicly available, most notably from organizations such as the World Bank, the census bureaus of different countries, and the CDC. The problem is the difficulty in using the data. - Unfortunately, though the data is open, using it to answer specific - questions often involves tedious 'foraging' --- finding the data, cleaning - the data, reconciling different formats and schemas, figuring out how to merge - data about the same entity from different sources, etc. This error prone - and tedious process is repeated, once (or more) by each organization. - This is a problem in almost every area of study involving data, from the social - sciences and physical sciences to public policy.
+Unfortunately, though the data is open, using it to answer specific questions often involves tedious 'foraging' --- finding the data, cleaning the data, reconciling different formats and schemas, figuring out how to merge data about the same entity from different sources, etc. This error-prone and tedious process is repeated once (or more) by each organization. This is a problem in almost every area of study involving data, from the social sciences and physical sciences to public policy. +Data Commons is an attempt to ameliorate some of this tedium by doing this work once, on a large scale, and providing cloud-accessible APIs to the cleaned, normalized and joined data. While there are millions of datasets and it will be a while before Data Commons includes a substantial fraction of them, in every domain, some collections of data get used more frequently than others. We have started with a core set of these in the hope that useful applications can be built on top of them. -Data Commons is an attempt to ameliorate some of this tedium by doing this once, on a large scale and providing cloud accessible APIs to the cleaned, normalized and joined data. While there are millions of datasets and it will be a while before Data Commons includes a substantial fraction of them, in every domain, some collections of data get used more frequently than others. We have started with a core set of these in the hope that useful applications can be built on top of them. +One of the salient aspects of Data Commons is that it is not a repository of datasets. There are many great repositories (Dataverse, BigQuery public datasets, data.gov) that more than adequately address that need. Instead, it is a single unified database created by normalizing/aligning the schemas and entity references across these different datasets (to the extent possible). So, for example, if a researcher wants the population, violent crime rate and unemployment rate of a county, the researcher does not have to go to three different datasets (Census, FBI and BLS), but can instead get them from a single database, using one schema, one API. Of course, she would want to know the provenance of the data, which is recorded with every data point, something enabled in the APIs. -One of the salient aspects of Data Commons is that it is not a -repository of data sets. There are many great repositories (dataverse, -BQ public datasets, data.gov) that more than adequately address that -need. Instead, it is a single unified database created by -normalizing/aligning the schemas and entity references across these -different datasets (to the extent possible). So, for example, if a -researcher wants the population, violent crime rate and unemployment -rate of a county, the researcher does not have to go to three -different datasets (Census, FBI and BLS), but can instead, get it from -a single database, using one schema, one API. Of course, she would -want to know the provenance of the data, which is recorded with every -data point, something enabled in the APIs. + - - - - Data Commons has benefited greatly from our collaborations with different government organizations and academic institutions and are looking to expand the set of collaborative projects. In particular, we are looking for partners for: 1.
Building new applications that bring the data in data commons to new categories of users. +1. Building new applications that bring the data in Data Commons to new categories of users. 1. Using Data Commons in data science and machine learning courses. 1. Tools for visualization, data cleaning, etc. -We are also looking for a small number of partnerships with -government, NGO and academic organizations for growing the core of -data commons with data about more topics and more regions of the world. From e8e901a29cf11df66c14421da41d3238e35f1500 Mon Sep 17 00:00:00 2001 From: Carolyn Au Date: Mon, 12 Oct 2020 10:37:55 -0700 Subject: [PATCH 2/2] remove duplicate contribution text: --- index.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/index.md b/index.md index 7db4f8c52..581a85d0d 100644 --- a/index.md +++ b/index.md @@ -12,12 +12,4 @@ Unfortunately, though the data is open, using it to answer specific questions of Data Commons is an attempt to ameliorate some of this tedium by doing this work once, on a large scale, and providing cloud-accessible APIs to the cleaned, normalized and joined data. While there are millions of datasets and it will be a while before Data Commons includes a substantial fraction of them, in every domain, some collections of data get used more frequently than others. We have started with a core set of these in the hope that useful applications can be built on top of them. -One of the salient aspects of Data Commons is that it is not a repository of datasets. There are many great repositories (Dataverse, BigQuery public datasets, data.gov) that more than adequately address that need. Instead, it is a single unified database created by normalizing/aligning the schemas and entity references across these different datasets (to the extent possible). So, for example, if a researcher wants the population, violent crime rate and unemployment rate of a county, the researcher does not have to go to three different datasets (Census, FBI and BLS), but can instead get them from a single database, using one schema, one API. Of course, she would want to know the provenance of the data, which is recorded with every data point, something enabled in the APIs. -Data Commons has benefited greatly from our collaborations with different government organizations and academic institutions and are looking to expand the set of collaborative projects. In particular, we are looking for partners for: -1. Building new applications that bring the data in Data Commons to new categories of users. -1. Using Data Commons in data science and machine learning courses. -1. Tools for visualization, data cleaning, etc. -We are also looking for a small number of partnerships with government, NGO and academic organizations for growing the core of Data Commons with data about more topics and more regions of the world. +One of the salient aspects of Data Commons is that it is not a repository of datasets. There are many great repositories (Dataverse, BigQuery public datasets, data.gov) that more than adequately address that need. Instead, it is a single unified database created by normalizing/aligning the schemas and entity references across these different datasets (to the extent possible).
So, for example, if a researcher wants the population, violent crime rate and unemployment rate of a county, the researcher does not have to go to three different datasets (Census, FBI and BLS), but can instead get them from a single database, using one schema, one API. Of course, she would want to know the provenance of the data, which is recorded with every data point, something enabled in the APIs. \ No newline at end of file
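
For the CSV-cleaning requirements in `contributing/adding_datasets.md` above, the reshaping can be illustrated with a minimal pandas sketch — one value column per [`StatisticalVariable`](https://datacommons.org/browser/StatisticalVariable), plus `observationAbout` and `observationDate` columns. The input layout, file name, and column names here are hypothetical, chosen only to show the target shape:

```python
import pandas as pd

# Hypothetical "long" input: one row per (place, date, variable) observation,
# with columns place_dcid, date, variable, value.
raw = pd.read_csv("raw.csv")

# Give each StatisticalVariable its own value column, keyed by the columns
# that will map to observationAbout and observationDate. (This assumes one
# observation per place/date/variable; pivot_table would otherwise silently
# average duplicates.)
wide = raw.pivot_table(index=["place_dcid", "date"],
                       columns="variable",
                       values="value").reset_index()
wide.columns.name = None

# Rename the key columns after the properties they will map to.
wide = wide.rename(columns={"place_dcid": "observationAbout",
                            "date": "observationDate"})
wide.to_csv("cleaned.csv", index=False)
```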
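For the graph and time-series views described in `data_model.md` above, a sketch using the Data Commons Python client (assuming the 2020-era `datacommons` package; depending on your setup an API key via `dc.set_api_key(...)` may be required). The place and variable mirror the Cook County example:

```python
import datacommons as dc

# Time series view: fetch one StatisticalVariable's series for one place.
# geoId/17031 is Cook County, IL; Count_Person is its total population.
series = dc.get_stat_series("geoId/17031", "Count_Person")
for date in sorted(series):  # series is a dict mapping date -> value
    print(date, series[date])

# Graph view: the same database queried through SPARQL.
query = '''
SELECT ?name ?dcid
WHERE {
  ?a typeOf County .
  ?a name ?name .
  ?a dcid ("geoId/17031") .
  ?a dcid ?dcid
}
'''
print(dc.query(query))
```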