diff --git a/notebooks/3.1 Exploratory data analysis I.ipynb b/notebooks/3.1 Exploratory data analysis I.ipynb
deleted file mode 100644
index a2d9f86..0000000
--- a/notebooks/3.1 Exploratory data analysis I.ipynb
+++ /dev/null
@@ -1,1653 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 3.1 Basics of exploratory data analysis with 🐼\n",
- "\n",
- "In this notebook, we will focus on another essential skill in data analysis, namely the ability to get insights about a dataset by means of plotting and summary statistics."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### `describe()`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 383,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " week_day \n",
- " day_hour \n",
- " n_mentions \n",
- " year \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " count \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " \n",
- " \n",
- " mean \n",
- " 3.196169 \n",
- " 12.782547 \n",
- " 0.807733 \n",
- " 2014.777226 \n",
- " \n",
- " \n",
- " std \n",
- " 1.946637 \n",
- " 7.611198 \n",
- " 0.859091 \n",
- " 1.687017 \n",
- " \n",
- " \n",
- " min \n",
- " 0.000000 \n",
- " 0.000000 \n",
- " 0.000000 \n",
- " 2010.000000 \n",
- " \n",
- " \n",
- " 25% \n",
- " 2.000000 \n",
- " 5.000000 \n",
- " 0.000000 \n",
- " 2013.000000 \n",
- " \n",
- " \n",
- " 50% \n",
- " 3.000000 \n",
- " 15.000000 \n",
- " 1.000000 \n",
- " 2015.000000 \n",
- " \n",
- " \n",
- " 75% \n",
- " 5.000000 \n",
- " 19.000000 \n",
- " 1.000000 \n",
- " 2016.000000 \n",
- " \n",
- " \n",
- " max \n",
- " 6.000000 \n",
- " 23.000000 \n",
- " 6.000000 \n",
- " 2017.000000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " week_day day_hour n_mentions year\n",
- "count 2819.000000 2819.000000 2819.000000 2819.000000\n",
- "mean 3.196169 12.782547 0.807733 2014.777226\n",
- "std 1.946637 7.611198 0.859091 1.687017\n",
- "min 0.000000 0.000000 0.000000 2010.000000\n",
- "25% 2.000000 5.000000 0.000000 2013.000000\n",
- "50% 3.000000 15.000000 1.000000 2015.000000\n",
- "75% 5.000000 19.000000 1.000000 2016.000000\n",
- "max 6.000000 23.000000 6.000000 2017.000000"
- ]
- },
- "execution_count": 383,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# the default behavior is to include only\n",
- "# columns with numerical values\n",
- "\n",
- "\n",
- "df.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# in this case it fails, as pandas does not know\n",
- "# how to handle a column with values of type list (fair enough)\n",
- "\n",
- "df.describe(include='all')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 386,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " created_at \n",
- " week_day \n",
- " day_hour \n",
- " n_mentions \n",
- " year \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " count \n",
- " 2819 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " \n",
- " \n",
- " unique \n",
- " 2819 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " top \n",
- " 2013-02-21 06:48:55 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " freq \n",
- " 1 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " first \n",
- " 2010-06-04 18:31:57 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " last \n",
- " 2017-04-05 14:56:29 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " mean \n",
- " NaN \n",
- " 3.196169 \n",
- " 12.782547 \n",
- " 0.807733 \n",
- " 2014.777226 \n",
- " \n",
- " \n",
- " std \n",
- " NaN \n",
- " 1.946637 \n",
- " 7.611198 \n",
- " 0.859091 \n",
- " 1.687017 \n",
- " \n",
- " \n",
- " min \n",
- " NaN \n",
- " 0.000000 \n",
- " 0.000000 \n",
- " 0.000000 \n",
- " 2010.000000 \n",
- " \n",
- " \n",
- " 25% \n",
- " NaN \n",
- " 2.000000 \n",
- " 5.000000 \n",
- " 0.000000 \n",
- " 2013.000000 \n",
- " \n",
- " \n",
- " 50% \n",
- " NaN \n",
- " 3.000000 \n",
- " 15.000000 \n",
- " 1.000000 \n",
- " 2015.000000 \n",
- " \n",
- " \n",
- " 75% \n",
- " NaN \n",
- " 5.000000 \n",
- " 19.000000 \n",
- " 1.000000 \n",
- " 2016.000000 \n",
- " \n",
- " \n",
- " max \n",
- " NaN \n",
- " 6.000000 \n",
- " 23.000000 \n",
- " 6.000000 \n",
- " 2017.000000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " created_at week_day day_hour n_mentions \\\n",
- "count 2819 2819.000000 2819.000000 2819.000000 \n",
- "unique 2819 NaN NaN NaN \n",
- "top 2013-02-21 06:48:55 NaN NaN NaN \n",
- "freq 1 NaN NaN NaN \n",
- "first 2010-06-04 18:31:57 NaN NaN NaN \n",
- "last 2017-04-05 14:56:29 NaN NaN NaN \n",
- "mean NaN 3.196169 12.782547 0.807733 \n",
- "std NaN 1.946637 7.611198 0.859091 \n",
- "min NaN 0.000000 0.000000 0.000000 \n",
- "25% NaN 2.000000 5.000000 0.000000 \n",
- "50% NaN 3.000000 15.000000 1.000000 \n",
- "75% NaN 5.000000 19.000000 1.000000 \n",
- "max NaN 6.000000 23.000000 6.000000 \n",
- "\n",
- " year \n",
- "count 2819.000000 \n",
- "unique NaN \n",
- "top NaN \n",
- "freq NaN \n",
- "first NaN \n",
- "last NaN \n",
- "mean 2014.777226 \n",
- "std 1.687017 \n",
- "min 2010.000000 \n",
- "25% 2013.000000 \n",
- "50% 2015.000000 \n",
- "75% 2016.000000 \n",
- "max 2017.000000 "
- ]
- },
- "execution_count": 386,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# that's a workaround to include all other columns\n",
- "\n",
- "df.describe(exclude=[list])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 320,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "count 2819\n",
- "unique 2819\n",
- "top 2013-02-21 06:48:55\n",
- "freq 1\n",
- "first 2010-06-04 18:31:57\n",
- "last 2017-04-05 14:56:29\n",
- "Name: created_at, dtype: object"
- ]
- },
- "execution_count": 320,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.created_at.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 387,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['week_day_name'] = df['week_day_name'].astype('category')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 389,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " created_at \n",
- " week_day \n",
- " day_hour \n",
- " n_mentions \n",
- " year \n",
- " week_day_name \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " count \n",
- " 2819 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " 2819.000000 \n",
- " 2819 \n",
- " \n",
- " \n",
- " unique \n",
- " 2819 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 7 \n",
- " \n",
- " \n",
- " top \n",
- " 2013-02-21 06:48:55 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " Friday \n",
- " \n",
- " \n",
- " freq \n",
- " 1 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 530 \n",
- " \n",
- " \n",
- " first \n",
- " 2010-06-04 18:31:57 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " last \n",
- " 2017-04-05 14:56:29 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " mean \n",
- " NaN \n",
- " 3.196169 \n",
- " 12.782547 \n",
- " 0.807733 \n",
- " 2014.777226 \n",
- " NaN \n",
- " \n",
- " \n",
- " std \n",
- " NaN \n",
- " 1.946637 \n",
- " 7.611198 \n",
- " 0.859091 \n",
- " 1.687017 \n",
- " NaN \n",
- " \n",
- " \n",
- " min \n",
- " NaN \n",
- " 0.000000 \n",
- " 0.000000 \n",
- " 0.000000 \n",
- " 2010.000000 \n",
- " NaN \n",
- " \n",
- " \n",
- " 25% \n",
- " NaN \n",
- " 2.000000 \n",
- " 5.000000 \n",
- " 0.000000 \n",
- " 2013.000000 \n",
- " NaN \n",
- " \n",
- " \n",
- " 50% \n",
- " NaN \n",
- " 3.000000 \n",
- " 15.000000 \n",
- " 1.000000 \n",
- " 2015.000000 \n",
- " NaN \n",
- " \n",
- " \n",
- " 75% \n",
- " NaN \n",
- " 5.000000 \n",
- " 19.000000 \n",
- " 1.000000 \n",
- " 2016.000000 \n",
- " NaN \n",
- " \n",
- " \n",
- " max \n",
- " NaN \n",
- " 6.000000 \n",
- " 23.000000 \n",
- " 6.000000 \n",
- " 2017.000000 \n",
- " NaN \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " created_at week_day day_hour n_mentions \\\n",
- "count 2819 2819.000000 2819.000000 2819.000000 \n",
- "unique 2819 NaN NaN NaN \n",
- "top 2013-02-21 06:48:55 NaN NaN NaN \n",
- "freq 1 NaN NaN NaN \n",
- "first 2010-06-04 18:31:57 NaN NaN NaN \n",
- "last 2017-04-05 14:56:29 NaN NaN NaN \n",
- "mean NaN 3.196169 12.782547 0.807733 \n",
- "std NaN 1.946637 7.611198 0.859091 \n",
- "min NaN 0.000000 0.000000 0.000000 \n",
- "25% NaN 2.000000 5.000000 0.000000 \n",
- "50% NaN 3.000000 15.000000 1.000000 \n",
- "75% NaN 5.000000 19.000000 1.000000 \n",
- "max NaN 6.000000 23.000000 6.000000 \n",
- "\n",
- " year week_day_name \n",
- "count 2819.000000 2819 \n",
- "unique NaN 7 \n",
- "top NaN Friday \n",
- "freq NaN 530 \n",
- "first NaN NaN \n",
- "last NaN NaN \n",
- "mean 2014.777226 NaN \n",
- "std 1.687017 NaN \n",
- "min 2010.000000 NaN \n",
- "25% 2013.000000 NaN \n",
- "50% 2015.000000 NaN \n",
- "75% 2016.000000 NaN \n",
- "max 2017.000000 NaN "
- ]
- },
- "execution_count": 389,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.describe(exclude=['object'])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Plotting"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 334,
- "metadata": {},
- "outputs": [],
- "source": [
- "%matplotlib inline\n",
- "\n",
- "import matplotlib.pyplot as plt"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Histograms\n",
- "\n",
- "They are useful to see the distribution of a certain variable in your dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 184,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " text \n",
- " \n",
- " \n",
- " n_mentions \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1145 \n",
- " \n",
- " \n",
- " 1 \n",
- " 1231 \n",
- " \n",
- " \n",
- " 2 \n",
- " 329 \n",
- " \n",
- " \n",
- " 3 \n",
- " 78 \n",
- " \n",
- " \n",
- " 4 \n",
- " 28 \n",
- " \n",
- " \n",
- " 5 \n",
- " 6 \n",
- " \n",
- " \n",
- " 6 \n",
- " 2 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " text\n",
- "n_mentions \n",
- "0 1145\n",
- "1 1231\n",
- "2 329\n",
- "3 78\n",
- "4 28\n",
- "5 6\n",
- "6 2"
- ]
- },
- "execution_count": 184,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.groupby(['n_mentions'])[['text']].count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 185,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "plt.figure(figsize=(10, 6))\n",
- "plt.hist(df.n_mentions, bins='auto', rwidth=1.0)\n",
- "plt.title('Distribution of the number of mentions per tweet')\n",
- "plt.ylabel(\"Tweets\")\n",
- "plt.xlabel(\"Mentions (per tweet)\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 187,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "plt.figure(figsize=(10, 6))\n",
- "plt.hist(df.day_hour, bins='auto', rwidth=0.6)\n",
- "plt.title('Distribution of tweets per hour of the day')\n",
- "plt.ylabel(\"Tweets\")\n",
- "plt.xlabel(\"Hour of the day\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 188,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_2017 = df[df.created_at.dt.year == 2017]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 189,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "plt.figure(figsize=(10, 6))\n",
- "plt.hist(df_2017.day_hour, bins='auto', rwidth=0.6)\n",
- "plt.title('Year 2017')\n",
- "plt.ylabel(\"Tweets\")\n",
- "plt.xlabel(\"Hour of the day\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "So far we have used `matplotlib` directly to generate our plots.\n",
- "\n",
- "`pandas`'s dataframes provide some methods that directly call `matplotlib`'s API behind the scenes:\n",
- "- `hist()` for histograms\n",
- "- `boxplot()` for boxplots\n",
- "- `plot()` for other types of plots (specified with e.g. `kind='scatter'`)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "By passing the `by` parameter to e.g. `hist()` it is possible to produce one histogram plot of a given variable for each value in another column."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's see how we can plot the distribution of tweets over the hours of the day, by year:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 335,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['year'] = df.created_at.dt.year"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 356,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "axes = df.hist(column='day_hour', by='year', figsize=(10,10))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Scatter plots\n",
- "\n",
- "They are useful to plot the relation between two variables in your dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 176,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " created_at \n",
- " text \n",
- " tweet_link \n",
- " week_day \n",
- " day_hour \n",
- " tweet_mentions \n",
- " n_mentions \n",
- " \n",
- " \n",
- " id \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 849636868052275200 \n",
- " 2017-04-05 14:56:29 \n",
- " b'And so the robots spared humanity ... https://t.co/v7JUJQWfCv' \n",
- " https://twitter.com/i/web/status/849636868052275200 \n",
- " Wednesday \n",
- " 14 \n",
- " [] \n",
- " 0 \n",
- " \n",
- " \n",
- " 848988730585096192 \n",
- " 2017-04-03 20:01:01 \n",
- " b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exactly. Tesla is absurdly overvalued if based on the past, but that's irr\\xe2\\x80\\xa6 https://t.co/qQcTqkzgMl\" \n",
- " https://twitter.com/i/web/status/848988730585096192 \n",
- " Monday \n",
- " 20 \n",
- " [@ForIn2020, @waltmossberg, @mims, @defcon_5] \n",
- " 4 \n",
- " \n",
- " \n",
- " 848943072423497728 \n",
- " 2017-04-03 16:59:35 \n",
- " b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- " https://twitter.com/i/web/status/848943072423497728 \n",
- " Monday \n",
- " 16 \n",
- " [@waltmossberg, @mims, @defcon_5] \n",
- " 3 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " created_at \\\n",
- "id \n",
- "849636868052275200 2017-04-05 14:56:29 \n",
- "848988730585096192 2017-04-03 20:01:01 \n",
- "848943072423497728 2017-04-03 16:59:35 \n",
- "\n",
- " text \\\n",
- "id \n",
- "849636868052275200 b'And so the robots spared humanity ... https://t.co/v7JUJQWfCv' \n",
- "848988730585096192 b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exactly. Tesla is absurdly overvalued if based on the past, but that's irr\\xe2\\x80\\xa6 https://t.co/qQcTqkzgMl\" \n",
- "848943072423497728 b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- "\n",
- " tweet_link \\\n",
- "id \n",
- "849636868052275200 https://twitter.com/i/web/status/849636868052275200 \n",
- "848988730585096192 https://twitter.com/i/web/status/848988730585096192 \n",
- "848943072423497728 https://twitter.com/i/web/status/848943072423497728 \n",
- "\n",
- " week_day day_hour \\\n",
- "id \n",
- "849636868052275200 Wednesday 14 \n",
- "848988730585096192 Monday 20 \n",
- "848943072423497728 Monday 16 \n",
- "\n",
- " tweet_mentions n_mentions \n",
- "id \n",
- "849636868052275200 [] 0 \n",
- "848988730585096192 [@ForIn2020, @waltmossberg, @mims, @defcon_5] 4 \n",
- "848943072423497728 [@waltmossberg, @mims, @defcon_5] 3 "
- ]
- },
- "execution_count": 176,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 357,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl4AAAGDCAYAAAD6aR7qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3dfZzVdZ338ffbEXUylSXRhECMDNcV00KBC6/WtgzTtlh3t5YLKtvC2u3+hlaTR+peuLZR1j62m2sly1qI8iqb3GwlM+mGC0iMckwlxRtwKKGIUJu8GT/XH78feDjMnJlz+M7vd86c1/PxmIdzvufucz4zOm9/dx9HhAAAADD8Dii7AAAAgHZB8AIAACgIwQsAAKAgBC8AAICCELwAAAAKQvACAAAoCMELaEK2r7G9uKT3tu0v2v6d7Z+UUUMttn9h+8yy66iX7bD9grLr6I/txbZ/Y/vXQ3z8pbaXDVMt59v+8XC8NtAMCF7AENh+wPY224dWrL3V9qoSyxouZ0g6S9LzIuL0MgvpL4BGxJ9FxKqSShpxbE+U9AFJJ0bEc/u5/0zbDxVfGTAyEbyAoeuQ9J6yi6iX7Y46n3KspAci4rHhqAfDy/aBdT5loqTfRsS24agHwN4IXsDQLZH0Qdujq++wPSnflXRgxdoq22/Nvz/f9mrbn7S90/Z9tv9Hvr4l35r2pqqXPdL2TbYfsf0D28dWvPYJ+X07bG+0/bqK+66x/Tnb37H9mKSX9VPvONvX58+/1/aCfP0tkj4vaabtR21f1s9z6/ostg+2/XHbm20/bPv/2O7M7zvT9kO2P5A/71e235zfd4GkeZI+lNfyX/n6A7ZfUfHan7K9Nf/6lO2DB3vt/P5zbN+Z97fH9gf7+awH55/xpIq1sbZ7bR9l+0jb384fs8P2j2zX+u/qK2zfkz/+M7adv+YBthfZfjCv9cu2j6j8HFV1VfbgUttft73M9i5J5/fzOY7IX3N7/h6L8vd8haSbJI3Le3xN1fMOlfTfFfc/antcfvdB+Ws+4mz377SK542z/Y38/e63/e6BGmL7Ofnv4i5nu7YnV93/b/nv1S7bt9n+n/n6c23/wfZzKh774vw9R9X4GQClIngBQ7de0ipJ+/yBHqLpkm6X9BxJX5H0VUmnSXqBpPmSPm372RWPnyfpf0s6UtLPJC2X9vwxvCl/jaMk/Z2kz9o+seK5/0vS5ZIOk9Tf8TJflfSQpHGS/kbSv9j+i4i4WtLbJa2JiGdHxCUJPstHJb1Q0in5/eMlfaTitZ4r6Yh8/S2SPmP7TyLiqvwzfyyv5S/7qeNiSTPy136RpNMlLRrstfP7rpb0tog4TNJJkr5f/eIR8bik6yTNrVh+naQf5FuIPqCsj2MlHS3pw5JqzWF7tbI+nZy/zux8/fz862WSni/p2ZI+XeN1qr1W0tcljVb+e1Ll35X14fmS/lzSGyW9OSK+J+lVkrbmPT6/8kn5Vs/K+58dEVvzu1+j7Oc+WtL1u+vNg+d/Sfq5sr6/XNJ7bc9W/z4j6Y+SjpH09/lXpVuV/XzHKPtd+7+2D4mIXyv79/F1FY99g6SvRsSTA7wXUL6I4Isvvgb5kvSApFco+wP9e2V/aN8qaVV+/yRlf3APrHjOKklvzb8/X9I9FfdNzR9/dMXabyWdkn9/jbI/ILvve7akPkkTJL1e0o+q6vsPSZdUPPfLNT7LhPy1DqtYu0LSNRW1/rjG84f8WSRZ0mOSJlfcN1PS/fn3Z0rqrerbNkkzKj7L4v5+Fvn3mySdU3HfbGW7SYfy2pslvU3S4YP87F8haVPF7dWS3ph//8+SviXpBUP4HQpJZ1TcvlbShfn3N0v6x4r7pkh6UtKB+ed4qEYPLpX0wxrv2yHpCWXHcO1ee5ue+d3d5/Wrnt/f+18q6XsVt0+U1Jt/P13S5qrHXyTpiwPU9qSkEyrW/mWQ37/fSXpR/v3r
Ja2ueK1fSzp9qP9e88VXGV9s8QLqEBF3SPq2pAsbePrDFd/35q9XvVa5xWtLxfs+KmmHsi1Ux0qanu+u2ml7p7KtY8/t77n9GCdpR0Q8UrH2oLKtE6k/y1hJz5J0W0WtN+bru/02Ip6quP0H7d2HWsblte/2YL42lNf+a0nnSHrQ2a7cmQO8xy2SnmV7uu1JygLlN/P7lki6V9J3ne1yHez3ovKswcpa+vscByrbijYUtX7eR0oa1c/r1/Pz7k/1ZznE2a72Y5Xtmqz8/fyw+v8sY5V9zsr6K+uU7Q/avsv27/PXOiL/TFIWek+0fZyyE0J+HxFNdyYuUKnegzABSJdI+qmkT1Ss7T4Q/VmSduXf73OGWJ0m7P4m3203RtJWZX+kfhARZ9V4bq3dXVsljbF9WEX4miipZz/r7c9vlIWwP4uIRl6/1ueQss9yrKRf5Lcn5muDv3DErZJemx8P9E5lW6Am9PO4PtvXKtvd+LCkb+/uW/7PD0j6QH4c2Pdt3xoRNw+lhn4+x24TJT2Vv984Zb9XkvacLDFWe6vVp98o26p0rKQ7K15/qD+PwX4G1bYo26J5/BAeu13Z55wg6e6K2iRJ+fFcH1K2u/IXEfG07d8p25KqiPhj/rOZL+kESf9ZZ61A4djiBdQpIu6V9DVJ765Y267sD9l82x22/15VBwk34BzbZ9g+SNmxXmsjYouyLW4vtP0G26Pyr9Ns/+kQ698i6f9JusL2IbZPVnb8U/LrMkXE05KWSvqk7aMkyfb4Gsf7VHtY2XFJA1khaVF+wPuRyo4dG/Rz2D7I9jzbR0R2PNAuSU/XeMpXlO3Wmpd/v/t1Xm37BflB8r9Xtgu31uvU+hzvs31cHrL/RdLX8q11v1S2NencPCQuknTwUF84IvqUhcrLbR/m7CSN92voP++HJT1n98H+Q/ATSY/Y/ifbnfm/DyfZPm2A2q6TdKntZ+XHKVaeZHKYsmC2XdKBtj8i6fCql/myst3frxHBCy2A4AU05p8lHVq1tkDSQmXHN/2ZsnCzP76ibOvaDkkvUfZ/9bu3srxS2UH1W5Xt8vlX1fHHWNnWm0n587+p7Piw7+1nvQP5J2W749bmZ919T9kxTENxtbJdSTttd/Vz/2JlJz3cLqlb2ZbIoV549g2SHshreruyUNWviFinbKvmOGVn+e12vLLP86ikNZI+GxG3DPH9K31BWWj4oaT7lR1s/q78vX8v6R+VnW3ak9dR73W13pU/7z5lJ1t8JX/PQUXE3cqC4X35z2HcII/vU3YSwSn5Z/lNXvtAwe2dyna5/lrZMX1frLhvpbJd079Utgvyj6rarRoRq5WF3Z9GxF67KYFm5Ih6tyIDANA8bH9f0lci4vNl1wIMhuAFAGhZ+S7MmyRNqDphBGhK7GoEALQk219Stqv3vYQutAq2eAEAABSELV4AAAAFIXgBAAAUpCUuoHrkkUfGpEmTyi4DAABgULfddttvIqL6QseSWiR4TZo0SevXry+7DAAAgEHZHvCacuxqBAAAKAjBCwAAoCAELwAAgIIQvAAAAApC8AIAACgIwQsAAKAgBC8AAICCELwAAAAKQvACAAAoCMELAACgIC0xMmi4Tbrwhn3WHvjouSVUMjLQz/ToaVrTL79JDz/yxJ7bRx92kNZdfFaJFQFoF22/xau/P2i11lEb/UyPnqZVHbok6eFHntD0y28qqSIA7aTtgxeA9lIdugZbB4CUCF4AAAAFIXgBAAAUhOAFoK0cfdhBda0DQEptH7wGOjOMM8YaQz/To6dprbv4rH1CFmc1AiiKI6LsGgY1bdq0WL9+fdllAAAADMr2bRExrb/72n6LFwAAQFEIXgAAAAUheAEAABSE4AUAAFAQZjWKOXip0c/06GlaZ125Svdse2zP7eOPOlQ3vf/M8gpqcV0berRk5UZt3dmrcaM7tXD2FM05dXzZZQFNqe23eDEHLy36mR49Tas6dEnSPdse01lXriqnoBbXtaFH
F13XrZ6dvQpJPTt7ddF13era0FN2aUBTavvgBaC9VIeuwdZR25KVG9X7ZN9ea71P9mnJyo0lVQQ0N4IXAKBhW3f21rUOtDuCFwCgYeNGd9a1DrQ7gheAtnL8UYfWtY7aFs6eos5RHXutdY7q0MLZU0qqCGhubR+8mIOXFv1Mj56mddP7z9wnZHFWY+PmnDpeV5w3VeNHd8qSxo/u1BXnTeWsRmAAzGoEAABIiFmNAAAATYDgBQAAUBCCFwAAQEEIXgAAAAUheAEAABSEIdliAHFq9DM9eprWyZfcqF2PPzPm5vCDO3T7ZWeXWFFrm7d0jVZv2rHn9qzJY7R8wcwSK2p9i7q6tWLdFvVFqMPW3OkTtHjO1LLLQgJtv8WLAcRp0c/06Gla1aFLknY93qeTL7mxpIpaW3XokqTVm3Zo3tI1JVXU+hZ1dWvZ2s3qyy/31BehZWs3a1FXd8mVIYW2D14A2kt16BpsHbVVh67B1jG4Feu21LWO1kLwAgCgifQNcGHzgdbRWgheAAA0kQ67rnW0FoIXgLZy+MEdda2jtlmTx9S1jsHNnT6hrnW0lrYPXgwgTot+pkdP07r9srP3CVmc1di45Qtm7hOyOKtx/yyeM1XzZ0zcs4Wrw9b8GRM5q3GEYEg2AABAQgzJBgAAaAIELwAAgIIQvAAAAApC8AIAACgIwQsAAKAgwzYk2/YESV+WdLSkkHRVRPyb7TGSviZpkqQHJL0uIn43XHUMBQOI06Kf6dHTtF5w0Q16quKE7gMt3XsF/WwUA53R7Lo29GjJyo3aurNX40Z3auHsKZpz6vhSahnOLV5PSfpARJwoaYakd9g+UdKFkm6OiOMl3ZzfLg0DiNOin+nR07SqQ5ckPRXZOurHQGc0u64NPbroum717OxVSOrZ2auLrutW14aeUuoZtuAVEb+KiJ/m3z8i6S5J4yW9VtKX8od9SdKc4aoBAKpVh67B1lEbA53R7Jas3KjeJ/v2Wut9sk9LVm4spZ5CjvGyPUnSqZLWSTo6In6V3/VrZbsi+3vOBbbX216/ffv2IsoEANSJgc5odlt39ta1PtyGPXjZfrakb0h6b0Tsqrwvssvm9/tvZ0RcFRHTImLa2LFjh7tMAEADGOiMZjdudGdd68NtWIOX7VHKQtfyiLguX37Y9jH5/cdI2jacNQBApQMHyAMDraM2Bjqj2S2cPUWdo/aez9o5qkMLZ08ppZ5hC162LelqSXdFxJUVd10v6U3592+S9K3hqmEoGECcFv1Mj56mde8V5+4TsjirsXEMdEazm3PqeF1x3lSNH90pSxo/ulNXnDe1tLMah21Itu0zJP1IUrekp/PlDys7zutaSRMlPajschI7ar0WQ7IBAECrqDUke9iu4xURP5Y00Mb7lw/X+wIAADQrrlwPAABQEIIXAABAQQheAAAABSF4AQAAFGTYDq5vJQwgTot+pkdP05q3dI1Wb3rmZOpZk8do+YKZJVbU2pppADHQ7Np+ixcDiNOin+nR07SqQ5ckrd60Q/OWrimpotbWbAOIgWbX9sELQHupDl2DraO2ZhtADDQ7ghcAoGHNNoAYaHYELwBAw5ptADHQ7AheANrKrMlj6lpHbc02gBhodm0fvBhAnBb9TI+eprV8wcx9QhZnNTau2QYQA81u2IZkp8SQbAAA0CpqDclu+y1eAAAARSF4AQAAFITgBQAAUBCCFwAAQEEIXgAAAAVhSLYYQJwa/UyPnqZ18iU3atfjz4y5OfzgDt1+2dklVtTaGJINDF3bb/FiAHFa9DM9eppWdeiSpF2P9+nkS24sqaLWxpBsoD5tH7wAtJfq0DXYOmpjSDZQH4IXAKBhDMkG6kPwAgA0jCHZQH0IXgDayuEHd9S1jtoYkg3Up+2DFwOI06Kf6dHTtG6/7Ox9QhZnNTaOIdlAfRiSDQAAkBBDsgEAAJoAwQsAAKAgBC8AAICCELwAAAAKQvACAAAoCEOyxQDi1OhnevQ0rRMu/o7+
2PfMGd2HdFh3X35OiRW1tkVd3Vqxbov6ItRha+70CVo8Z2rZZQFNqe23eDGAOC36mR49Tas6dEnSH/tCJ1z8nZIqam2Lurq1bO1m9eWXJuqL0LK1m7Woq7vkyoDm1PbBC0B7qQ5dg62jthXrttS1DrQ7ghcAoGF9A1yEe6B1oN0RvAAADeuw61oH2h3BC0BbOaSj/0Aw0Dpqmzt9Ql3rQLtr++DFAOK06Gd69DStuy8/Z5+QxVmNjVs8Z6rmz5i4ZwtXh635MyZyViMwAIZkAwAAJMSQbAAAgCZA8AIAACgIwQsAAKAgBC8AAICCELwAAAAKwpBsMYA4NfqZHj1N66wrV+mebY/tuX38UYfqpvefWV5BLW7e0jVavWnHntuzJo/R8gUzS6wIaF5tv8WLAcRp0c/06Gla1aFLku7Z9pjOunJVOQW1uOrQJUmrN+3QvKVrSqoIaG5tH7wAtJfq0DXYOmqrDl2DrQPtjuAFAABQEIIXAABAQQheANrK8UcdWtc6aps1eUxd60C7a/vgxQDitOhnevQ0rZvef+Y+IYuzGhu3fMHMfUIWZzUCA2NINgAAQEIMyQYAAGgCBC8AAICCELwAAAAKQvACAAAoyLAFL9tfsL3N9h0Va5fa7rH9s/zrnOF6fwAAgGYznEOyr5H0aUlfrlr/ZER8fBjft24MIE6LfqZHT9M6+ZIbtevxvj23Dz+4Q7dfdnaJFQF7W9TVrRXrtqgvQh225k6foMVzppZdFhIYti1eEfFDSU0/rIsBxGnRz/ToaVrVoUuSdj3ep5MvubGkioC9Lerq1rK1m9WXX+6pL0LL1m7Woq7ukitDCmUc4/VO27fnuyL/pIT3B9DGqkPXYOtA0Vas21LXOlpL0cHrc5ImSzpF0q8kfWKgB9q+wPZ62+u3b99eVH0AAJSqb4ALmw+0jtZSaPCKiIcjoi8inpa0VNLpNR57VURMi4hpY8eOLa5IAABK1GHXtY7WUmjwsn1Mxc2/knTHQI8FgOFw+MEdda0DRZs7fUJd62gtw3k5iRWS1kiaYvsh22+R9DHb3bZvl/QySe8brvcfKgYQp0U/06Onad1+2dn7hCzOakQzWTxnqubPmLhnC1eHrfkzJnJW4wjBkGwAAICEGJINAADQBAheAAAABSF4AQAAFITgBQAAUBCCFwAAQEGGc0h2y2AAcVr0M70XXHSDnqo4AflAS/deQU8bxQDitOYtXaPVm54ZzTtr8hgtXzCzxIqA5tX2W7wYQJwW/UyvOnRJ0lORraN+DCBOqzp0SdLqTTs0b+makioCmlvbBy+g2VWHrsHWURsDiNOqDl2DrQPtjuAFoK0wgBhAmQheANoKA4gBlIngBTS5AwfIAwOtozYGEKc1a/KYutaBdtf2wYsBxGnRz/TuveLcfUIWZzU2jgHEaS1fMHOfkMVZjcDAGJINAACQEEOyAQAAmgDBCwAAoCAELwAAgIIQvAAAAArCrEYxWzA15gqm17WhR0tWbtTWnb0aN7pTC2dP0ZxTx5ddFiCJ30+gHjW3eNnusL28qGLKwGzBtJgrmF7Xhh5ddF23enb2KiT17OzVRdd1q2tDT9mlAfx+AnWqGbwiok/SsbYPKqgetDjmCqa3ZOVG9T7Zt9da75N9WrJyY0kVAc/g9xOoz1B2Nd4nabXt6yU9tnsxIq4ctqoA7LF1Z29d60CR+P0E6jOUg+s3Sfp2/tjDKr4AFGDc6M661oEi8fsJ1GfQLV4RcVkRhWBkOND971ZkrmDjFs6eoouu695rd07nqA4tnD2lxKqADL+fQH0G3eJl+xbb36/+KqK4IjBbMC3mCqY359TxuuK8qRo/ulOWNH50p644bypnjaEp8PsJ1GfQWY22X1Jx8xBJfy3pqYj40HAWVolZjQAAoFXUmtU4lF2Nt1Utrbb9kySVAQAAtJFBg5ftMRU3D5D0EklHDFtFAAAAI9RQLidxm6SQZElPSbpf0luGsygAAICRaCi7Go8rohAA
AICRbii7GkdJ+gdJL82XVkn6j4h4chjrAgAAGHGGsqvxc5JGSfpsfvsN+dpbh6uoojEkO62TL7lRux5/5po+hx/codsvO7vEilofQ4gBYGQYypXrT4uIN0XE9/OvN0s6bbgLKwpDstOqDl2StOvxPp18yY0lVdT6GEIMACPHUIJXn+3Ju2/Yfr6kvhqPRxurDl2DrWNwDCEGgJFjKLsaF0q6xfZ9ys5sPFbSm4e1KgB7MIQYAEaOoZzVeLPt4yXtHry1MSIeH96yAOw2bnSnevoJWQwhBoDWM5RdjVJ20dSTJJ0i6fW23zh8JaGVHX5wR13rGNzC2VPUOWrv/jGEGABa01CGZP+npI9LOkPZQfWnSep3/lArYkh2WrdfdvY+IYuzGvcPQ4gBYOQYypDsuySdGIM9cBgxJBsAALSKWkOyh7Kr8Q5Jz01bEgAAQPsZ8OB62/+lbEbjYZLutP0TSXsOqo+I1wx/eQAAACNHrbMaP15YFQAAAG1gwOAVET8oshAAAICRbqiXkwAAAMB+GsqV60c8hjqnRT/Tm7d0jVZv2rHn9qzJY7R8wcwSKwIANGIo1/H6S9sjdssYQ53Top/pVYcuSVq9aYfmLV1TUkUAgEYNJVC9XtI9tj9m+4ThLqhoDHVOi36mVx26BlsHADSvQYNXRMyXdKqkTZKusb3G9gW2Dxv26gAAAEaQIe1CjIhdkr4u6auSjpH0V5J+avtdw1gbAADAiDKUY7xeY/ubklZJGiXp9Ih4laQXSfrA8JY3/BjqnBb9TG/W5DF1rQMAmtdQtnj9taRPRsTUiFgSEdskKSL+IOktw1pdARjqnBb9TG/5gpn7hCzOagSA1jTokOxmwJBsAADQKvZrSLbtGbZvtf2o7Sds99nelb5MAACAkW0ouxo/LWmupHskdUp6q6TPDGdRAAAAI9FQz2q8V1JHRPRFxBclccAOAABAnYYyMugPtg+S9DPbH5P0KzHjEQAAoG5DCVBvyB/3TkmPSZqg7ExHAAAA1GHQLV4R8aDtsfn3lw31hW1/QdKrJW2LiJPytTGSviZpkqQHJL0uIn5Xf9lpTbrwhn3WHvjouSVUMjLQz/QWdXVrxbot6otQh6250ydo8ZypZZcFAKjTgFu8nLnU9m8kbZT0S9vbbX9kiK99jfY9FuxCSTdHxPGSbs5vl6q/kFBrHbXRz/QWdXVr2drN6ssv/dIXoWVrN2tRV3fJlQEA6lVrV+P7JM2SdFpEjImIP5E0XdIs2+8b7IUj4oeSqqf4vlbSl/LvvyRpTv0lA+1lxbotda0DAJpXreD1BklzI+L+3QsRcZ+k+ZLe2OD7HR0Rv8q//7Wkowd6YD6Ie73t9du3b2/w7YDW1zfARY4HWgcANK9awWtURPymejEitiub2bhfIrtk/oB/OSLiqoiYFhHTxo4du79vB7SsDruudQBA86oVvJ5o8L5aHrZ9jCTl/9zW4OsAbWPu9Al1rQMAmlet4PUi27v6+XpEUqOnU10v6U3592+S9K0GXyeZgc624yy8xtDP9BbPmar5Mybu2cLVYWv+jImc1QgALWjYhmTbXiHpTElHSnpY0iWSuiRdK2mipAeVXU6i+gD8fTAkGwAAtIpaQ7KHcuX6hkTE3AHuevlwvScAAEAzY/QPAABAQQheAAAABSF4AQAAFITgBQAAUJBhO7i+lTDUOS36md5xF96w19WGLel+egoALaftt3gx1Dkt+pledeiSspEPx9FTAGg5bR+8gGY30JX2mNQIAK2H4AUAAFAQghcAAEBBCF5Ak3Od6wCA5tX2wYuhzmnRz/Tu/+i5+4QszmoEgNY0bEOyU2JINgAAaBW1hmS3/RYvAACAohC8AAAACkLwAgAAKAjBCwAAoCAELwAAgIIwJFsMdU6NfqZ3wsXf0R/7njkD+ZAO6+7LzymxotY2b+kard60Y8/tWZPHaPmCmSVW1NoWdXVrxbot6otQh6250ydo8ZypZZcFNKW23+LFUOe0
6Gd61aFLkv7YFzrh4u+UVFFrqw5dkrR60w7NW7qmpIpa26Kubi1bu1l9+aWJ+iK0bO1mLerqLrkyoDm1ffACml116BpsHbVVh67B1lHbinVb6loH2h3BCwDQsL4BLsI90DrQ7gheAICGdbj/qaEDrQPtjuAFNLlDOvr/AzbQOmqbNXlMXeuobe70CXWtA+2u7YMXQ53Top/p3X35OfuELM5qbNzyBTP3CVmc1di4xXOmav6MiXu2cHXYmj9jImc1AgNgSDYAAEBCDMkGAABoAgQvAACAghC8AAAACkLwAgAAKAjBCwAAoCAMyRZDnVOjn+nR07S6NvRoycqN2rqzV+NGd2rh7Cmac+r4sstqWfQTGLq23+LFUOe06Gd69DStrg09uui6bvXs7FVI6tnZq4uu61bXhp6yS2tJ9BOoT9sHLwDtZcnKjep9sm+vtd4n+7Rk5caSKmpt9BOoD8ELQFvZurO3rnXURj+B+hC8ALSVcaM761pHbfQTqA/BC0BbWTh7ijpHdey11jmqQwtnTympotZGP4H6tH3wYqhzWvQzPXqa1pxTx+uK86Zq/OhOWdL40Z264rypnIXXIPoJ1Ich2QAAAAkxJBsAAKAJELwAAAAKQvACAAAoCMELAACgIAQvAACAgjAkWwwgTo1+pnfchTeo8vxjS7qfnjZsUVe3Vqzbor4IddiaO32CFs+ZWnZZLYt+AkPX9lu8GECcFv1Mrzp0SVLk66jfoq5uLVu7WX35pXT6IrRs7WYt6uouubLWRD+B+rR98AKa3UBX2mv+K/A1pxXrttS1jtroJ1AfgheAttI3wEWjB1pHbfQTqA/BC0Bb6bDrWkdt9BOoD8ELaHID/fniz1pj5k6fUNc6aqOfQH3aPngxgDgt+pne/R89d5+QxVmNjVs8Z6rmz5i4Z4tMh635MyZyFl6D6CdQH4ZkAwAAJMSQbAAAgCZA8AIAACgIwQsAAKAgBC8AAICClDKr0fYDkh6R1CfpqYEOQCsKswXTop/pnXzJjdr1eN+e24cf3KHbLzu7xIpaG7MFAZSlzC1eL4uIU5oxdNVaR230M73q0CVJux7v08mX3FhSRa2N2YIAysSuRqDJVYeuwdZRG7MFAZSprOAVkr5r+1jI5tQAAA0+SURBVDbbF/T3ANsX2F5ve/327dsLLg/ASMVsQQBlKit4nRERL5b0KknvsP3S6gdExFURMS0ipo0dO7b4CgGMSMwWBFCmUoJXRPTk/9wm6ZuSTi+jDqAVHH5wR13rqI3ZggDKVHjwsn2o7cN2fy/plZLuKLqO3ZgtmBb9TO/2y87eJ2RxVmPjmC0IoEyFz2q0/XxlW7mk7HIWX4mIy2s9h1mNAACgVdSa1Vj4dbwi4j5JLyr6fQEAAMrG5SQAAAAKQvACAAAoCMELAACgIAQvAACAgpQyJLvZMNQ5LfqZHkOy02JIdlr0M72uDT1asnKjtu7s1bjRnVo4e4rmnDq+7LJaVjP1s+23eDHUOS36mR5DstNiSHZa9DO9rg09uui6bvXs7FVI6tnZq4uu61bXhp6yS2tJzdbPtg9eQLNjSHZaDMlOi36mt2TlRvU+ufe/371P9mnJyo0lVdTamq2fBC8AbYUh2WnRz/S27uytax21NVs/CV4A2gpDstOin+mNG91Z1zpqa7Z+EryAJseQ7LQYkp0W/Uxv4ewp6hy197/fnaM6tHD2lJIqam3N1s+2D14MdU6LfqbHkOy0GJKdFv1Mb86p43XFeVM1fnSnLGn86E5dcd5UzmpsULP1s/Ah2Y1gSDYAAGgVtYZkt/0WLwAAgKIQvAAAAApC8AIAACgIwQsAAKAgBC8AAICCMCRbDHVOjX6mR0/Tmrd0jVZv2rHn9qzJY7R8wcwSK2ptzTSAeKSgpyNX22/xYqhzWvQzPXqaVnXokqTVm3Zo3tI1JVXU2pptAPFIQE9HtrYPXgDaS3XoGmwdtTXbAOKRgJ6ObAQvAEDDmm0A8UhAT0c2ghcA
oGHNNoB4JKCnIxvBC0BbmTV5TF3rqK3ZBhCPBPR0ZGv74MVQ57ToZ3r0NK3lC2buE7I4q7FxzTaAeCSgpyMbQ7IBAAASYkg2AABAEyB4AQAAFITgBQAAUBCCFwAAQEEIXgAAAAVhSLYYQJwa/UyPnqY1/fKb9PAjT+y5ffRhB2ndxWeVWFFrY6AzMHRtv8WLAcRp0c/06Gla1aFLkh5+5AlNv/ymkipqbQx0BurT9sELQHupDl2DraM2BjoD9SF4AQAaxkBnoD4ELwBAwxjoDNSH4AWgrRx92EF1raM2BjoD9Wn74MUA4rToZ3r0NK11F5+1T8jirMbGMdAZqA9DsgEAABJiSDYAAEATIHgBAAAUhOAFAABQEIIXAABAQQheAAAABWFIthhAnBr9TI+epsWQ7LQWdXVrxbot6otQh6250ydo8ZypZZcFNKW23+LFAOK06Gd69DQthmSntairW8vWblZffmmivggtW7tZi7q6S64MaE5tH7wAtBeGZKe1Yt2WutaBdkfwAgA0rG+Ai3APtA60O4IXAKBhHXZd60C7I3gBaCsMyU5r7vQJda0D7a7tgxcDiNOin+nR07QYkp3W4jlTNX/GxD1buDpszZ8xkbMagQEwJBsAACAhhmQDAAA0AYIXAABAQQheAAAABSF4AQAAFKSU4GX7bNsbbd9r+8IyagAAACha4UOybXdI+oyksyQ9JOlW29dHxJ1F17IbA4jTop/p0dO0zrpyle7Z9tie28cfdahuev+Z5RUEoG2UscXrdEn3RsR9EfGEpK9Kem0JdUhiAHFq9DM9eppWdeiSpHu2PaazrlxVTkEA2koZwWu8pMrpqQ/lawAw7KpD12DrAJBS0x5cb/sC2+ttr9++fXvZ5QAAAOy3MoJXj6TKIV7Py9f2EhFXRcS0iJg2duzYwooDAAAYLmUEr1slHW/7ONsHSfo7SdeXUAeANnT8UYfWtQ4AKRUevCLiKUnvlLRS0l2Sro2IXxRdx24MIE6LfqZHT9O66f1n7hOyOKsRQFEYkg0AAJAQQ7IBAACaAMELAACgIAQvAACAghC8AAAACkLwAgAAKAjBCwAAoCAELwAAgIIQvAAAAApC8AIAACgIwQsAAKAgLTEyyPZ2SQ8W8FZHSvpNAe/TLuhnevQ0LfqZFv1Mj56mVVQ/j42Isf3d0RLBqyi21w80Wwn1o5/p0dO06Gda9DM9eppWM/STXY0AAAAFIXgBAAAUhOC1t6vKLmCEoZ/p0dO06Gda9DM9eppW6f3kGC8AAICCsMULAACgIAQvSbbPtr3R9r22Lyy7nlZn+wu2t9m+o+xaRgLbE2zfYvtO27+w/Z6ya2p1tg+x/RPbP897elnZNY0Etjtsb7D97bJraXW2H7DdbftntteXXc9IYHu07a/bvtv2XbZnllJHu+9qtN0h6ZeSzpL0kKRbJc2NiDtLLayF2X6ppEclfTkiTiq7nlZn+xhJx0TET20fJuk2SXP4HW2cbUs6NCIetT1K0o8lvSci1pZcWkuz/X5J0yQdHhGvLrueVmb7AUnTIoJreCVi+0uSfhQRn7d9kKRnRcTOoutgi5d0uqR7I+K+iHhC0lclvbbkmlpaRPxQ0o6y6xgpIuJXEfHT/PtHJN0laXy5VbW2yDya3xyVf7X3/4XuJ9vPk3SupM+XXQtQzfYRkl4q6WpJiognyghdEsFLyv6Abam4/ZD4o4YmZXuSpFMlrSu3ktaX7xb7maRtkm6KCHq6fz4l6UOSni67kBEiJH3X9m22Lyi7mBHgOEnbJX0x3x3+eduHllEIwQtoEbafLekbkt4bEbvKrqfVRURfRJwi6XmSTrfNbvEG2X61pG0RcVvZtYwgZ0TEiyW9StI78kM40LgDJb1Y0uci4lRJj0kq5ZhugpfUI2lCxe3n5WtA08iPQ/qGpOURcV3Z9Ywk+e6GWySdXXYtLWyWpNfkxyV9VdJf2F5WbkmtLSJ68n9uk/RNZYfFoHEPSXqoYsv2
15UFscIRvLKD6Y+3fVx+sN3fSbq+5JqAPfIDwa+WdFdEXFl2PSOB7bG2R+ffdyo7uebucqtqXRFxUUQ8LyImKftv6PcjYn7JZbUs24fmJ9Io3x32SkmcJb4fIuLXkrbYnpIvvVxSKScoHVjGmzaTiHjK9jslrZTUIekLEfGLkstqabZXSDpT0pG2H5J0SURcXW5VLW2WpDdI6s6PSZKkD0fEd0qsqdUdI+lL+VnNB0i6NiK4BAKaxdGSvpn9P5cOlPSViLix3JJGhHdJWp5vZLlP0pvLKKLtLycBAABQFHY1AgAAFITgBQAAUBCCFwAAQEEIXgAAAAUheAEAABSE4AWgbrbD9icqbn/Q9qWJXvsa23+T4rUGeZ+/tX2X7VuG+73y9xtt+x8rbo+z/fUi3htA8yB4AWjE45LOs31k2YVUsl3PtQnfImlBRLxsuOqpMlrSnuAVEVsjYtgDJoDmQvAC0IinJF0l6X3Vd1RvsbL9aP7PM23/wPa3bN9n+6O259n+ie1u25MrXuYVttfb/mU+B3D3UOsltm+1fbvtt1W87o9sX69+rkRte27++nfY/td87SOSzpB0te0lVY8fUp351e+/kddzq+1Z+fqltr9ge1X+/HfnL/1RSZNt/yz/HJNs35E/5xDbX8xff4Ptl+Xr59u+zvaNtu+x/bGKXlyTf6Zu2/v8HAA0p7a/cj2Ahn1G0u27w8AQvUjSn0raoezK0Z+PiNNtv0fZVaXfmz9ukrLZdJMl3WL7BZLeKOn3EXGa7YMlrbb93fzxL5Z0UkTcX/lmtsdJ+ldJL5H0O0nftT0nIv7Z9l9I+mBErG+wzn+T9MmI+LHticqmX/xp/vwTJL1M0mGSNtr+nLKBvCflg7lle1LF+71DUkTEVNsn5HW+ML/vFEmnKtvKuNH2v0s6StL4iDgpf63RNXoOoIkQvAA0JCJ22f6ypHdL6h3i026NiF9Jku1NknYHp25lQWW3ayPiaUn32L5PWZB5paSTK7amHSHpeElPSPpJdejKnSZpVURsz99zuaSXSupKUOcrJJ2Yj3WRpMNtPzv//oaIeFzS47a3KRsBU8sZkv5dkiLibtsPStodvG6OiN/ntdwp6VhJv5D0/DyE3VBRH4AmR/ACsD8+Jemnkr5YsfaU8sMYbB8g6aCK+x6v+P7pittPa+//HlXPMgtJlvSuiFhZeYftMyU91lj5AxpKnQdImhERf6yqp/r5fdq//9bu81oR8TvbL5I0W9LbJb1O0t/vx3sAKAjHeAFoWETskHStsgPVd3tA2a49SXqNpFENvPTf2j4gP57q+ZI2KtuV9w+2R0mS7RfaPnSQ1/mJpD+3fWQ+EHuupB80UE9/vqtst6Pyek4Z5PGPKNv12J8fSZqXv84LJU1U9pn7lZ/UcEBEfEPSImW7WgG0AIIXgP31CUmVZzcuVRZ2fi5pphrbGrVZWWj6b0lvz7cqfV7ZwfM/zQ9K/w8NsiUp3114oaRbJP1c0m0R8a0G6unPuyVNyw/0v1PZlqdatfxW2XFpd1Qf0C/ps5IOsN0t6WuSzs93VQ5kvKRVtn8maZmkixr+FAAK5YjqLfoAAAAYDmzxAgAAKAjBCwAAoCAELwAAgIIQvAAAAApC8AIAACgIwQsAAKAgBC8AAICCELwAAAAK8v8BGAnb7ptrn1kAAAAASUVORK5CYII=",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "plt.figure(figsize=(10, 6))\n",
- "\n",
- "# specify the type of plot and the two\n",
- "# variables to be plotted against one another\n",
- "plt.scatter(df.n_mentions, df.day_hour)\n",
- "\n",
- "# give a title to the plot\n",
- "plt.title('Number of mentions vs hour of the day')\n",
- "\n",
- "# give a label to the axes\n",
- "plt.ylabel(\"Day hour\")\n",
- "plt.xlabel(\"Number of mentions\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Bar charts\n",
- "\n",
- "They are useful to plot categorical data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 170,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.bar?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 343,
- "metadata": {},
- "outputs": [],
- "source": [
- "tweets_by_weekday = df.groupby(df.created_at.dt.weekday)[['text']].count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 344,
- "metadata": {},
- "outputs": [],
- "source": [
- "week_days = [\n",
- " \"Mon\",\n",
- " \"Tue\",\n",
- " \"Wed\",\n",
- " \"Thur\",\n",
- " \"Fri\",\n",
- " \"Sat\",\n",
- " \"Sun\"\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 360,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "plt.figure(figsize=(8, 6))\n",
- "\n",
- "# specify the type of plot and the labels\n",
- "# for the y axis (the bars)\n",
- "plt.bar(\n",
- " tweets_by_weekday.index,\n",
- " tweets_by_weekday.text,\n",
- " tick_label=week_days,\n",
- " width=0.5\n",
- ")\n",
- "\n",
- "# give a title to the plot\n",
- "plt.title('Elon Musk\\'s week on Twitter')\n",
- "\n",
- "# give a label to the axes\n",
- "plt.ylabel(\"Number of tweets\")\n",
- "plt.xlabel(\"Week day\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Box plots\n",
- "\n",
- "![box plot explained](./figures/eda-boxplot.png)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 346,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " text \n",
- " \n",
- " \n",
- " created_at \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 315 \n",
- " \n",
- " \n",
- " 1 \n",
- " 385 \n",
- " \n",
- " \n",
- " 2 \n",
- " 380 \n",
- " \n",
- " \n",
- " 3 \n",
- " 361 \n",
- " \n",
- " \n",
- " 4 \n",
- " 530 \n",
- " \n",
- " \n",
- " 5 \n",
- " 426 \n",
- " \n",
- " \n",
- " 6 \n",
- " 422 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " text\n",
- "created_at \n",
- "0 315\n",
- "1 385\n",
- "2 380\n",
- "3 361\n",
- "4 530\n",
- "5 426\n",
- "6 422"
- ]
- },
- "execution_count": 346,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tweets_by_weekday"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 347,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " text \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " count \n",
- " 7.000000 \n",
- " \n",
- " \n",
- " mean \n",
- " 402.714286 \n",
- " \n",
- " \n",
- " std \n",
- " 67.551744 \n",
- " \n",
- " \n",
- " min \n",
- " 315.000000 \n",
- " \n",
- " \n",
- " 25% \n",
- " 370.500000 \n",
- " \n",
- " \n",
- " 50% \n",
- " 385.000000 \n",
- " \n",
- " \n",
- " 75% \n",
- " 424.000000 \n",
- " \n",
- " \n",
- " max \n",
- " 530.000000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " text\n",
- "count 7.000000\n",
- "mean 402.714286\n",
- "std 67.551744\n",
- "min 315.000000\n",
- "25% 370.500000\n",
- "50% 385.000000\n",
- "75% 424.000000\n",
- "max 530.000000"
- ]
- },
- "execution_count": 347,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tweets_by_weekday.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 349,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 349,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAMw0lEQVR4nO3db4hl9XnA8e+TXa1iNm6rZpBdyZTGFwNbYmVqhGxhdqUS/xB9kZbYBsUObNOKDQ01rixNUTpE2xemQhGXDHGlcdsQkCxqbILOJd22prq4WsMEuiSKu6y1SrJkNBGVpy/mtzCuszv37sydO/PM9wOXued3zpnzu3D3u4cz909kJpKkWj406AlIkpaecZekgoy7JBVk3CWpIOMuSQWtH/QEAM4///wcHh4e9DSkD3jzzTc555xzBj0NaV4HDhx4PTMvmG/dioj78PAwzz777KCnIX1Ap9NhbGxs0NOQ5hURL59snZdlJKkg4y5JBRl3SSrIuEtSQcZdkgoy7tI89u7dy5YtW7jiiivYsmULe/fuHfSUpJ6siJdCSivJ3r172bVrF5OTk7z33nusW7eO8fFxAG644YYBz07qjmfu0gkmJiaYnJxk27ZtrF+/nm3btjE5OcnExMSgpyZ1zbhLJ5ienmbr1q3vG9u6dSvT09MDmpHUO+MunWBkZIT9+/e/b2z//v2MjIwMaEZS74y7dIJdu3YxPj7O1NQU7777LlNTU4yPj7Nr165BT03qmn9QlU5w/I+mt956K9PT04yMjDAxMeEfU7WqxEr4DtXR0dH0g8O0EvnBYVrJIuJAZo7Ot87LMpJUkHGXpIKMuyQVZNwlqSDjLkkFGXdJKsi4S1JBxl2SCjLuklSQcZekgoy7JBVk3CWpIOMuSQUZd0kqyLhLUkHGXZIKMu6SVJBxl6SCjLskFWTcJamgruIeES9FxH9HxMGIeLaN/UZEfD8i/qf9/PU2HhFxX0QciogXIuLSfj4ASdIH9XLmvi0zL5nzTds7gScz82LgybYMcBVwcbvtAO5fqslKkrqzmMsy1wF72v09wPVzxh/KWU8DGyPiwkUcR5LUo/VdbpfA9yIigQcyczcwlJlH2/pXgaF2fxPwypx9D7exo3PGiIgdzJ7ZMzQ0RKfTOa0HIPXTzMyMz02tSt3GfWtmHomIjwLfj4gfz12ZmdnC37X2H8RugNHR0RwbG+tld2lZdDodfG5qNerqskxmHmk/XwMeAS4D/vf45Zb287W2+RHgojm7b25jkqRlsmDcI+KciNhw/D5wJfAisA+4qW12E/Cddn8fcGN71czlwLE5l28kScugm8syQ8AjEXF8+4cz84mIeAb4VkSMAy8Df9i2fxy4GjgEvAXcvOSzliSd0oJxz8yfAJ+YZ/wN4Ip5xhO4ZUlmJ0k6Lb5DVZIKMu6SVJBxl6SCjLskFWTcJakg4y5JBRl3SSrIuEtSQcZdkgoy7pJUkHGXpIKMuyQVZNwlqSDjLkkFGXdJKsi4S1JBxl2SCjLuklSQcZekgoy7JBVk3CWpIOMuSQUZd0kqyLhLUkHGXZIKMu6SVJBxl6SCjLskFWTcJakg4y5JBRl3SSrIuEtSQcZdkgoy7pJUkHGXpIKMuyQVZNwlqaCu4x4R6yLiuYh4tC0/GBE/jYiD7XZJG4+IuC8iDkXECxFxab8mL0ma3/oetv0iMA18ZM7YbZn57RO2uwq4uN0+CdzffkqSlklXZ+4RsRm4Bvh6F5tfBzyUs54GNkbEhYuYoySpR92euX8N+DKw4YTxiYj4CvAksDMz3wY2Aa/M2eZwGzs6d8eI2AHsABgaGqLT6fQ8eanfZmZmfG5qVVow7hFxLfBaZh6IiLE5q+4AXgXOBHYDtwN3dXvgzNzd9mN0dDTHxsZOvYM0AJ1OB5+bWo26uSzzKeAzEfES8M/A9oj4p8w82i69vA18A7isbX8EuGjO/pvbmCRp
mSwY98y8IzM3Z+Yw8Dngqcz8/PHr6BERwPXAi22XfcCN7VUzlwPHMvPofL9bktQfvbxa5kTfjIgLgAAOAl9o448DVwOHgLeAmxc1Q0lSz3qKe2Z2gE67v/0k2yRwy2InJkk6fb5DVZIKMu6SVJBxl6SCjLskFWTcJakg4y5JBRl3SSrIuEtSQcZdkgoy7pJUkHGXpIKMuyQVZNwlqSDjLkkFGXdJKsi4S1JBi/kmJmnVmf1WyP6b/c4aaXA8c9eakpk93T52+6M972PYtRJ45q5V6xN3fo9jv3yn78cZ3vlY349x7tln8PzfXNn342jtMO5atY798h1euvuavh6j0+kwNjbW12PA8vwHorXFyzKSVJBxl6SCjLskFWTcJakg4y5JBRl3SSrIuEtSQcZdkgoy7pJUkHGXpIKMuyQVZNwlqSDjLkkFGXdJKsi4S1JBxl2SCjLuklRQ13GPiHUR8VxEPNqWfzMifhgRhyLiXyLizDb+a235UFs/3J+pS5JOppcz9y8C03OW7wHuzcyPAz8Dxtv4OPCzNn5v206StIy6intEbAauAb7elgPYDny7bbIHuL7dv64t09Zf0baXJC2Tbr8g+2vAl4ENbfk84OeZ+W5bPgxsavc3Aa8AZOa7EXGsbf/63F8YETuAHQBDQ0N0Op3TfAhay/r9vJmZmVm256b/BrSUFox7RFwLvJaZByJibKkOnJm7gd0Ao6OjuRzfMK9inniMfj9vOp1O348BLMtj0drSzZn7p4DPRMTVwFnAR4B/ADZGxPp29r4ZONK2PwJcBByOiPXAucAbSz5zSdJJLXjNPTPvyMzNmTkMfA54KjP/GJgCPts2uwn4Tru/ry3T1j+Vmbmks5YkndJiXud+O/CliDjE7DX1yTY+CZzXxr8E7FzcFCVJver2D6oAZGYH6LT7PwEum2ebXwF/sARzkySdJt+hKkkFGXdJKsi4S1JBxl2SCjLuklSQcZekgoy7JBVk3CWpoJ7exCStJBtGdvLbe5bhDdB7Ft5ksTaMwOynaktLw7hr1frF9N28dHd/g7hcnwo5vPOxvh9Da4uXZSSpIOMuSQUZd0kqyLhLUkHGXZIKMu6SVJBxl6SCjLskFWTcJakg4y5JBRl3SSrIz5bRqrYsn8nyRP+Pce7ZZ/T9GFpbjLtWrX5/aBjM/uexHMeRlpqXZSSpIOMuSQUZd0kqyLhLUkHGXZIKMu6SVJBxl6SCjLskFWTcJakg4y5JBRl3SSrIuEtSQcZdkgoy7pJU0IJxj4izIuK/IuL5iPhRRNzZxh+MiJ9GxMF2u6SNR0TcFxGHIuKFiLi03w9CkvR+3Xye+9vA9syciYgzgP0R8d227rbM/PYJ218FXNxunwTubz8lSctkwTP3nDXTFs9otzzFLtcBD7X9ngY2RsSFi5+qJKlbXX0TU0SsAw4AHwf+MTN/GBF/BkxExFeAJ4Gdmfk2sAl4Zc7uh9vY0RN+5w5gB8DQ0BCdTmeRD0XqD5+bWo26intmvgdcEhEbgUciYgtwB/AqcCawG7gduKvbA2fm7rYfo6OjOTY21tvMpeXwxGP43NRq1NOrZTLz58AU8OnMPNouvbwNfAO4rG12BLhozm6b25gkaZl082qZC9oZOxFxNvD7wI+PX0ePiACuB15su+wDbmyvmrkcOJaZR+f51ZKkPunmssyFwJ523f1DwLcy89GIeCoiLgACOAh8oW3/OHA1cAh4C7h56actSTqVBeOemS8AvzPP+PaTbJ/ALYufmiTpdPkOVUkqyLhLUkHGXZIKMu6SVJBxl6SCjLskFWTcJakg4y5JBRl3SSrIuEtSQcZdkgoy7pJUkHGXpIKMuyQVZNwlqSDjLkkFGXdJKsi4S1JBxl2SCjLuklSQcZekgoy7JBVk3CWpIOMuSQUZd0kqyLhLUkHrBz0BaTlFRO/73NP7cTKz952kJeSZu9aUzOzpNjU11fM+hl0rgXGXpIKMuyQVZNwlqSDjLkkFGXdJ
Ksi4S1JBxl2SCjLuklRQrIQ3XETE/wEvD3oe0jzOB14f9CSkk/hYZl4w34oVEXdppYqIZzNzdNDzkHrlZRlJKsi4S1JBxl06td2DnoB0OrzmLkkFeeYuSQUZd0kqyLhrzYqIjRHx56e573BE/NFSz0laKsZda9lG4LTiDgwDxl0rlnHXWnY38FsRcTAi/j4ibouIZyLihYi4EyAifrctnxUR50TEjyJiS9v399q+fznQRyHNw1fLaM2KiGHg0czcEhFXAp8F/hQIYB/wd5n5g4j4W+As4GzgcGZ+NSLGgL/KzGsHMnlpAesHPQFphbiy3Z5ryx8GLgZ+ANwFPAP8CviLgcxO6pFxl2YF8NXMfGCedecxG/szmD2Df3M5JyadDq+5ay37BbCh3f9X4E8i4sMAEbEpIj7a1j0A/DXwTeCeefaVVhzP3LVmZeYbEfHvEfEi8F3gYeA/IwJgBvh8RHwaeCczH46IdcB/RMR24N+A9yLieeDBzLx3QA9Dmpd/UJWkgrwsI0kFGXdJKsi4S1JBxl2SCjLuklSQcZekgoy7JBX0/wx1Y9pk3PIIAAAAAElFTkSuQmCC",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "tweets_by_weekday.boxplot()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 175,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.bar?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 222,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " created_at \n",
- " text \n",
- " tweet_link \n",
- " tweet_mentions \n",
- " n_mentions \n",
- " week_day \n",
- " day_hour \n",
- " \n",
- " \n",
- " id \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 849636868052275200 \n",
- " 2017-04-05 14:56:29 \n",
- " b'And so the robots spared humanity ... https://t.co/v7JUJQWfCv' \n",
- " https://twitter.com/i/web/status/849636868052275200 \n",
- " [] \n",
- " 0 \n",
- " 2 \n",
- " 14 \n",
- " \n",
- " \n",
- " 848988730585096192 \n",
- " 2017-04-03 20:01:01 \n",
- " b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exactly. Tesla is absurdly overvalued if based on the past, but that's irr\\xe2\\x80\\xa6 https://t.co/qQcTqkzgMl\" \n",
- " https://twitter.com/i/web/status/848988730585096192 \n",
- " [@ForIn2020, @waltmossberg, @mims, @defcon_5] \n",
- " 4 \n",
- " 0 \n",
- " 20 \n",
- " \n",
- " \n",
- " 848943072423497728 \n",
- " 2017-04-03 16:59:35 \n",
- " b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- " https://twitter.com/i/web/status/848943072423497728 \n",
- " [@waltmossberg, @mims, @defcon_5] \n",
- " 3 \n",
- " 0 \n",
- " 16 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " created_at \\\n",
- "id \n",
- "849636868052275200 2017-04-05 14:56:29 \n",
- "848988730585096192 2017-04-03 20:01:01 \n",
- "848943072423497728 2017-04-03 16:59:35 \n",
- "\n",
- " text \\\n",
- "id \n",
- "849636868052275200 b'And so the robots spared humanity ... https://t.co/v7JUJQWfCv' \n",
- "848988730585096192 b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exactly. Tesla is absurdly overvalued if based on the past, but that's irr\\xe2\\x80\\xa6 https://t.co/qQcTqkzgMl\" \n",
- "848943072423497728 b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- "\n",
- " tweet_link \\\n",
- "id \n",
- "849636868052275200 https://twitter.com/i/web/status/849636868052275200 \n",
- "848988730585096192 https://twitter.com/i/web/status/848988730585096192 \n",
- "848943072423497728 https://twitter.com/i/web/status/848943072423497728 \n",
- "\n",
- " tweet_mentions n_mentions \\\n",
- "id \n",
- "849636868052275200 [] 0 \n",
- "848988730585096192 [@ForIn2020, @waltmossberg, @mims, @defcon_5] 4 \n",
- "848943072423497728 [@waltmossberg, @mims, @defcon_5] 3 \n",
- "\n",
- " week_day day_hour \n",
- "id \n",
- "849636868052275200 2 14 \n",
- "848988730585096192 0 20 \n",
- "848943072423497728 0 16 "
- ]
- },
- "execution_count": 222,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 351,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " day_hour \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " count \n",
- " 2819.000000 \n",
- " \n",
- " \n",
- " mean \n",
- " 12.782547 \n",
- " \n",
- " \n",
- " std \n",
- " 7.611198 \n",
- " \n",
- " \n",
- " min \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 25% \n",
- " 5.000000 \n",
- " \n",
- " \n",
- " 50% \n",
- " 15.000000 \n",
- " \n",
- " \n",
- " 75% \n",
- " 19.000000 \n",
- " \n",
- " \n",
- " max \n",
- " 23.000000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " day_hour\n",
- "count 2819.000000\n",
- "mean 12.782547\n",
- "std 7.611198\n",
- "min 0.000000\n",
- "25% 5.000000\n",
- "50% 15.000000\n",
- "75% 19.000000\n",
- "max 23.000000"
- ]
- },
- "execution_count": 351,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[['day_hour']].describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 354,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "day_hour 5.0\n",
- "Name: 0.25, dtype: float64"
- ]
- },
- "execution_count": 354,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[['day_hour']].quantile(.25)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 247,
- "metadata": {},
- "outputs": [],
- "source": [
- "df.boxplot?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 251,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "\n",
- "df[['day_hour', 'week_day_name']].boxplot(\n",
- " by='week_day_name',\n",
- " grid=False,\n",
- " figsize=(8,6),\n",
- " fontsize=10\n",
- ")\n",
- "\n",
- "# give a title to the plot\n",
- "plt.title('')\n",
- "\n",
- "# give a label to the axes\n",
- "plt.xlabel(\"Day of the week\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 252,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "\n",
- "df[['day_hour', 'week_day']].boxplot(\n",
- " by='week_day',\n",
- " grid=True, # just to show the difference with/without\n",
- " figsize=(8,6),\n",
- " fontsize=10\n",
- ")\n",
- "\n",
- "# give a title to the plot\n",
- "plt.title('')\n",
- "\n",
- "# give a label to the axes\n",
- "plt.xlabel(\"Day of the week\")\n",
- "plt.show()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.4"
- },
- "toc": {
- "base_numbering": 1,
- "nav_menu": {},
- "number_sections": false,
- "sideBar": true,
- "skip_h1_title": false,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": false,
- "toc_position": {
- "height": "calc(100% - 180px)",
- "left": "10px",
- "top": "150px",
- "width": "384px"
- },
- "toc_section_display": true,
- "toc_window_display": true
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/3.1 Exploratory data analysis basics.ipynb b/notebooks/3.1 Exploratory data analysis basics.ipynb
index 8089ec7..f44c8e7 100644
--- a/notebooks/3.1 Exploratory data analysis basics.ipynb
+++ b/notebooks/3.1 Exploratory data analysis basics.ipynb
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -27,13 +27,202 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_pickle(\"./musk_tweets_enhanced.pkl\")"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 2819 entries, 849636868052275200 to 15434727182\n",
+ "Data columns (total 8 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 created_at 2819 non-null datetime64[ns]\n",
+ " 1 text 2819 non-null object \n",
+ " 2 tweet_link 2819 non-null object \n",
+ " 3 tweet_mentions 2819 non-null object \n",
+ " 4 n_mentions 2819 non-null int64 \n",
+ " 5 week_day_name 2819 non-null object \n",
+ " 6 week_day 2819 non-null int64 \n",
+ " 7 day_hour 2819 non-null int64 \n",
+ "dtypes: datetime64[ns](1), int64(3), object(4)\n",
+ "memory usage: 198.2+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " created_at \n",
+ " text \n",
+ " tweet_link \n",
+ " tweet_mentions \n",
+ " n_mentions \n",
+ " week_day_name \n",
+ " week_day \n",
+ " day_hour \n",
+ " \n",
+ " \n",
+ " id \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 849636868052275200 \n",
+ " 2017-04-05 14:56:29 \n",
+ " b'And so the robots spared humanity ... https:... \n",
+ " https://twitter.com/i/web/status/8496368680522... \n",
+ " [] \n",
+ " 0 \n",
+ " Wednesday \n",
+ " 2 \n",
+ " 14 \n",
+ " \n",
+ " \n",
+ " 848988730585096192 \n",
+ " 2017-04-03 20:01:01 \n",
+ " b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exa... \n",
+ " https://twitter.com/i/web/status/8489887305850... \n",
+ " [@ForIn2020, @waltmossberg, @mims, @defcon_5] \n",
+ " 4 \n",
+ " Monday \n",
+ " 0 \n",
+ " 20 \n",
+ " \n",
+ " \n",
+ " 848943072423497728 \n",
+ " 2017-04-03 16:59:35 \n",
+ " b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
+ " https://twitter.com/i/web/status/8489430724234... \n",
+ " [@waltmossberg, @mims, @defcon_5] \n",
+ " 3 \n",
+ " Monday \n",
+ " 0 \n",
+ " 16 \n",
+ " \n",
+ " \n",
+ " 848935705057280001 \n",
+ " 2017-04-03 16:30:19 \n",
+ " b'Stormy weather in Shortville ...' \n",
+ " https://twitter.com/i/web/status/8489357050572... \n",
+ " [] \n",
+ " 0 \n",
+ " Monday \n",
+ " 0 \n",
+ " 16 \n",
+ " \n",
+ " \n",
+ " 848416049573658624 \n",
+ " 2017-04-02 06:05:23 \n",
+ " b\"@DaveLeeBBC @verge Coal is dying due to nat ... \n",
+ " https://twitter.com/i/web/status/8484160495736... \n",
+ " [@DaveLeeBBC, @verge] \n",
+ " 2 \n",
+ " Sunday \n",
+ " 6 \n",
+ " 6 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " created_at \\\n",
+ "id \n",
+ "849636868052275200 2017-04-05 14:56:29 \n",
+ "848988730585096192 2017-04-03 20:01:01 \n",
+ "848943072423497728 2017-04-03 16:59:35 \n",
+ "848935705057280001 2017-04-03 16:30:19 \n",
+ "848416049573658624 2017-04-02 06:05:23 \n",
+ "\n",
+ " text \\\n",
+ "id \n",
+ "849636868052275200 b'And so the robots spared humanity ... https:... \n",
+ "848988730585096192 b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exa... \n",
+ "848943072423497728 b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
+ "848935705057280001 b'Stormy weather in Shortville ...' \n",
+ "848416049573658624 b\"@DaveLeeBBC @verge Coal is dying due to nat ... \n",
+ "\n",
+ " tweet_link \\\n",
+ "id \n",
+ "849636868052275200 https://twitter.com/i/web/status/8496368680522... \n",
+ "848988730585096192 https://twitter.com/i/web/status/8489887305850... \n",
+ "848943072423497728 https://twitter.com/i/web/status/8489430724234... \n",
+ "848935705057280001 https://twitter.com/i/web/status/8489357050572... \n",
+ "848416049573658624 https://twitter.com/i/web/status/8484160495736... \n",
+ "\n",
+ " tweet_mentions n_mentions \\\n",
+ "id \n",
+ "849636868052275200 [] 0 \n",
+ "848988730585096192 [@ForIn2020, @waltmossberg, @mims, @defcon_5] 4 \n",
+ "848943072423497728 [@waltmossberg, @mims, @defcon_5] 3 \n",
+ "848935705057280001 [] 0 \n",
+ "848416049573658624 [@DaveLeeBBC, @verge] 2 \n",
+ "\n",
+ " week_day_name week_day day_hour \n",
+ "id \n",
+ "849636868052275200 Wednesday 2 14 \n",
+ "848988730585096192 Monday 0 20 \n",
+ "848943072423497728 Monday 0 16 \n",
+ "848935705057280001 Monday 0 16 \n",
+ "848416049573658624 Sunday 6 6 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -43,7 +232,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -137,7 +326,7 @@
"max 6.000000 6.000000 23.000000"
]
},
- "execution_count": 3,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -152,7 +341,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -366,7 +555,7 @@
"std 0.859091 NaN 1.946637 7.611198 "
]
},
- "execution_count": 6,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -380,7 +569,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -483,7 +672,7 @@
"std NaN 0.859091 1.946637 7.611198"
]
},
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -496,7 +685,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -512,7 +701,7 @@
"Name: created_at, dtype: object"
]
},
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -523,7 +712,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -702,7 +891,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -722,7 +911,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -798,7 +987,7 @@
"6 2"
]
},
- "execution_count": 15,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -807,6 +996,33 @@
"df.groupby(['n_mentions'])[['text']].count()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1 1231\n",
+ "0 1145\n",
+ "2 329\n",
+ "3 78\n",
+ "4 28\n",
+ "5 6\n",
+ "6 2\n",
+ "Name: n_mentions, dtype: int64"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['n_mentions'].value_counts()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 16,
@@ -837,7 +1053,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
@@ -865,7 +1081,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -874,7 +1090,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 28,
"metadata": {},
"outputs": [
{
@@ -928,7 +1144,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@@ -937,7 +1153,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 30,
"metadata": {},
"outputs": [
{
@@ -968,7 +1184,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 31,
"metadata": {},
"outputs": [
{
@@ -1088,7 +1304,7 @@
"848943072423497728 Monday 0 16 2017 "
]
},
- "execution_count": 22,
+ "execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
@@ -1099,7 +1315,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
@@ -1143,7 +1359,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@@ -1152,7 +1368,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -1161,7 +1377,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -1178,7 +1394,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -1227,7 +1443,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -1303,7 +1519,7 @@
"6 422"
]
},
- "execution_count": 28,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -1314,7 +1530,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
@@ -1390,7 +1606,7 @@
"max 530.000000"
]
},
- "execution_count": 29,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -1401,7 +1617,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
@@ -1410,7 +1626,7 @@
""
]
},
- "execution_count": 30,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
@@ -1433,147 +1649,7 @@
},
{
"cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.bar?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " created_at \n",
- " text \n",
- " tweet_link \n",
- " tweet_mentions \n",
- " n_mentions \n",
- " week_day_name \n",
- " week_day \n",
- " day_hour \n",
- " year \n",
- " \n",
- " \n",
- " id \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 849636868052275200 \n",
- " 2017-04-05 14:56:29 \n",
- " b'And so the robots spared humanity ... https:... \n",
- " https://twitter.com/i/web/status/8496368680522... \n",
- " [] \n",
- " 0 \n",
- " Wednesday \n",
- " 2 \n",
- " 14 \n",
- " 2017 \n",
- " \n",
- " \n",
- " 848988730585096192 \n",
- " 2017-04-03 20:01:01 \n",
- " b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exa... \n",
- " https://twitter.com/i/web/status/8489887305850... \n",
- " [@ForIn2020, @waltmossberg, @mims, @defcon_5] \n",
- " 4 \n",
- " Monday \n",
- " 0 \n",
- " 20 \n",
- " 2017 \n",
- " \n",
- " \n",
- " 848943072423497728 \n",
- " 2017-04-03 16:59:35 \n",
- " b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- " https://twitter.com/i/web/status/8489430724234... \n",
- " [@waltmossberg, @mims, @defcon_5] \n",
- " 3 \n",
- " Monday \n",
- " 0 \n",
- " 16 \n",
- " 2017 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " created_at \\\n",
- "id \n",
- "849636868052275200 2017-04-05 14:56:29 \n",
- "848988730585096192 2017-04-03 20:01:01 \n",
- "848943072423497728 2017-04-03 16:59:35 \n",
- "\n",
- " text \\\n",
- "id \n",
- "849636868052275200 b'And so the robots spared humanity ... https:... \n",
- "848988730585096192 b\"@ForIn2020 @waltmossberg @mims @defcon_5 Exa... \n",
- "848943072423497728 b'@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- "\n",
- " tweet_link \\\n",
- "id \n",
- "849636868052275200 https://twitter.com/i/web/status/8496368680522... \n",
- "848988730585096192 https://twitter.com/i/web/status/8489887305850... \n",
- "848943072423497728 https://twitter.com/i/web/status/8489430724234... \n",
- "\n",
- " tweet_mentions n_mentions \\\n",
- "id \n",
- "849636868052275200 [] 0 \n",
- "848988730585096192 [@ForIn2020, @waltmossberg, @mims, @defcon_5] 4 \n",
- "848943072423497728 [@waltmossberg, @mims, @defcon_5] 3 \n",
- "\n",
- " week_day_name week_day day_hour year \n",
- "id \n",
- "849636868052275200 Wednesday 2 14 2017 \n",
- "848988730585096192 Monday 0 20 2017 \n",
- "848943072423497728 Monday 0 16 2017 "
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
+ "execution_count": 40,
"metadata": {},
"outputs": [
{
@@ -1649,7 +1725,7 @@
"max 23.000000"
]
},
- "execution_count": 33,
+ "execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
@@ -1660,7 +1736,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
@@ -1670,7 +1746,7 @@
"Name: 0.25, dtype: float64"
]
},
- "execution_count": 34,
+ "execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
@@ -1681,7 +1757,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@@ -1690,7 +1766,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 43,
"metadata": {},
"outputs": [
{
@@ -1726,7 +1802,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 44,
"metadata": {},
"outputs": [
{
diff --git a/notebooks/3.2 Exploratory data analysis II and working with texts.ipynb b/notebooks/3.2 Exploratory data analysis II and working with texts.ipynb
deleted file mode 100644
index ca8a132..0000000
--- a/notebooks/3.2 Exploratory data analysis II and working with texts.ipynb
+++ /dev/null
@@ -1,2423 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 3.2 Exploratory data analysis and working with texts\n",
- "\n",
- "In this notebook, we learn about:\n",
- "1. descriptive statistics to explore data;\n",
- "2. working with texts (hints)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Part 1: descriptive statistics\n",
- "\n",
- "*The goal of exploratory data analysis is to develop an understanding of your data. EDA is fundamentally a creative process. And like most creative processes, the key to asking quality questions is to generate a large quantity of questions.* \n",
- "\n",
- "Key questions:\n",
- "* Which kind of variation occurs within variables?\n",
- "* Which kind of co-variation occurs between variables?\n",
- "\n",
- "https://r4ds.had.co.nz/exploratory-data-analysis.html"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# imports\n",
- "\n",
- "import os, codecs\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "import seaborn as sns\n",
- "import matplotlib.pyplot as plt"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Import the dataset\n",
-    "Let us import the Venetian apprenticeship contracts dataset into memory."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "root_folder = \"../data/apprenticeship_venice/\"\n",
- "df_contracts = pd.read_csv(codecs.open(os.path.join(root_folder,\"professions_data.csv\"), encoding=\"utf8\"), sep=\";\")\n",
- "df_professions = pd.read_csv(codecs.open(os.path.join(root_folder,\"professions_classification.csv\"), encoding=\"utf8\"), sep=\",\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "Let's take another look at the dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 9653 entries, 0 to 9652\n",
- "Data columns (total 47 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 page_title 9653 non-null object \n",
- " 1 register 9653 non-null object \n",
- " 2 annual_salary 7870 non-null float64\n",
- " 3 a_profession 9653 non-null object \n",
- " 4 profession_code_strict 9618 non-null object \n",
- " 5 profession_code_gen 9614 non-null object \n",
- " 6 profession_cat 9597 non-null object \n",
- " 7 corporation 9350 non-null object \n",
- " 8 keep_profession_a 9653 non-null int64 \n",
- " 9 complete_profession_a 9653 non-null int64 \n",
- " 10 enrolmentY 9628 non-null float64\n",
- " 11 enrolmentM 9631 non-null float64\n",
- " 12 startY 9533 non-null float64\n",
- " 13 startM 9539 non-null float64\n",
- " 14 length 9645 non-null float64\n",
- " 15 has_fled 9653 non-null int64 \n",
- " 16 m_profession 9535 non-null object \n",
- " 17 m_profession_code_strict 9508 non-null object \n",
- " 18 m_profession_code_gen 9506 non-null object \n",
- " 19 m_profession_cat 9489 non-null object \n",
- " 20 m_corporation 9276 non-null object \n",
- " 21 keep_profession_m 9653 non-null int64 \n",
- " 22 complete_profession_m 9653 non-null int64 \n",
- " 23 m_gender 9554 non-null float64\n",
- " 24 m_name 9623 non-null object \n",
- " 25 m_surname 6960 non-null object \n",
- " 26 m_patronimic 2620 non-null object \n",
- " 27 m_atelier 1434 non-null object \n",
- " 28 m_coords 9639 non-null object \n",
- " 29 a_name 9653 non-null object \n",
- " 30 a_age 9303 non-null float64\n",
- " 31 a_gender 9522 non-null float64\n",
- " 32 a_geo_origins 7149 non-null object \n",
- " 33 a_geo_origins_std 4636 non-null object \n",
- " 34 a_coords 9610 non-null object \n",
- " 35 a_quondam 7848 non-null float64\n",
- " 36 accommodation_master 9653 non-null int64 \n",
- " 37 personal_care_master 9653 non-null int64 \n",
- " 38 clothes_master 9653 non-null int64 \n",
- " 39 generic_expenses_master 9653 non-null int64 \n",
- " 40 salary_in_kind_master 9653 non-null int64 \n",
- " 41 pledge_goods_master 9653 non-null int64 \n",
- " 42 pledge_money_master 9653 non-null int64 \n",
- " 43 salary_master 9653 non-null int64 \n",
- " 44 female_guarantor 9653 non-null int64 \n",
- " 45 period_cat 7891 non-null float64\n",
- " 46 incremental_salary 9653 non-null int64 \n",
- "dtypes: float64(11), int64(15), object(21)\n",
- "memory usage: 3.5+ MB\n"
- ]
- }
- ],
- "source": [
- "df_contracts.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " page_title \n",
- " register \n",
- " annual_salary \n",
- " a_profession \n",
- " profession_code_strict \n",
- " profession_code_gen \n",
- " profession_cat \n",
- " corporation \n",
- " keep_profession_a \n",
- " complete_profession_a \n",
- " ... \n",
- " personal_care_master \n",
- " clothes_master \n",
- " generic_expenses_master \n",
- " salary_in_kind_master \n",
- " pledge_goods_master \n",
- " pledge_money_master \n",
- " salary_master \n",
- " female_guarantor \n",
- " period_cat \n",
- " incremental_salary \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " Carlo Della sosta (Orese) 1592-08-03 \n",
- " asv, giustizia vecchia, accordi dei garzoni, 1... \n",
- " NaN \n",
- " orese \n",
- " orese \n",
- " orefice \n",
- " orefice \n",
- " Oresi \n",
- " 1 \n",
- " 1 \n",
- " ... \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " NaN \n",
- " 0 \n",
- " \n",
- " \n",
- " 1 \n",
- " Antonio quondam Andrea (squerariol) 1583-01-09 \n",
- " asv, giustizia vecchia, accordi dei garzoni, 1... \n",
- " 12.5 \n",
- " squerariol \n",
- " squerariol \n",
- " lavori allo squero \n",
- " lavori allo squero \n",
- " Squerarioli \n",
- " 1 \n",
- " 1 \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 0 \n",
- " 1.0 \n",
- " 0 \n",
- " \n",
- " \n",
- " 2 \n",
- " Cristofollo di Zuane (batioro in carta) 1591-0... \n",
- " asv, giustizia vecchia, accordi dei garzoni, 1... \n",
- " NaN \n",
- " batioro \n",
- " batioro \n",
- " battioro \n",
- " fabbricatore di foglie/fili/cordelle d'oro o a... \n",
- " Battioro \n",
- " 1 \n",
- " 1 \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " NaN \n",
- " 0 \n",
- " \n",
- " \n",
- "
\n",
- "
3 rows × 47 columns
\n",
- "
"
- ],
- "text/plain": [
- " page_title \\\n",
- "0 Carlo Della sosta (Orese) 1592-08-03 \n",
- "1 Antonio quondam Andrea (squerariol) 1583-01-09 \n",
- "2 Cristofollo di Zuane (batioro in carta) 1591-0... \n",
- "\n",
- " register annual_salary \\\n",
- "0 asv, giustizia vecchia, accordi dei garzoni, 1... NaN \n",
- "1 asv, giustizia vecchia, accordi dei garzoni, 1... 12.5 \n",
- "2 asv, giustizia vecchia, accordi dei garzoni, 1... NaN \n",
- "\n",
- " a_profession profession_code_strict profession_code_gen \\\n",
- "0 orese orese orefice \n",
- "1 squerariol squerariol lavori allo squero \n",
- "2 batioro batioro battioro \n",
- "\n",
- " profession_cat corporation \\\n",
- "0 orefice Oresi \n",
- "1 lavori allo squero Squerarioli \n",
- "2 fabbricatore di foglie/fili/cordelle d'oro o a... Battioro \n",
- "\n",
- " keep_profession_a complete_profession_a ... personal_care_master \\\n",
- "0 1 1 ... 1 \n",
- "1 1 1 ... 0 \n",
- "2 1 1 ... 0 \n",
- "\n",
- " clothes_master generic_expenses_master salary_in_kind_master \\\n",
- "0 1 1 0 \n",
- "1 0 1 0 \n",
- "2 0 0 0 \n",
- "\n",
- " pledge_goods_master pledge_money_master salary_master female_guarantor \\\n",
- "0 0 0 0 0 \n",
- "1 0 0 1 0 \n",
- "2 0 0 0 0 \n",
- "\n",
- " period_cat incremental_salary \n",
- "0 NaN 0 \n",
- "1 1.0 0 \n",
- "2 NaN 0 \n",
- "\n",
- "[3 rows x 47 columns]"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_contracts.head(3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['page_title', 'register', 'annual_salary', 'a_profession',\n",
- " 'profession_code_strict', 'profession_code_gen', 'profession_cat',\n",
- " 'corporation', 'keep_profession_a', 'complete_profession_a',\n",
- " 'enrolmentY', 'enrolmentM', 'startY', 'startM', 'length', 'has_fled',\n",
- " 'm_profession', 'm_profession_code_strict', 'm_profession_code_gen',\n",
- " 'm_profession_cat', 'm_corporation', 'keep_profession_m',\n",
- " 'complete_profession_m', 'm_gender', 'm_name', 'm_surname',\n",
- " 'm_patronimic', 'm_atelier', 'm_coords', 'a_name', 'a_age', 'a_gender',\n",
- " 'a_geo_origins', 'a_geo_origins_std', 'a_coords', 'a_quondam',\n",
- " 'accommodation_master', 'personal_care_master', 'clothes_master',\n",
- " 'generic_expenses_master', 'salary_in_kind_master',\n",
- " 'pledge_goods_master', 'pledge_money_master', 'salary_master',\n",
- " 'female_guarantor', 'period_cat', 'incremental_salary'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_contracts.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Every row represents an apprenticeship contract. Contracts were registered both at the guild's and at a public office. This is a sample of contracts from a much larger set of records.\n",
- "\n",
- "Some of the variables we will work with are:\n",
-    "* `annual_salary`: the annual salary paid to the apprentice, if any (in Venetian ducats).\n",
- "* `a_profession` to `corporation`: increasingly generic classifications for the apprentice's stated profession.\n",
- "* `startY` and `enrolmentY`: contract start and registration year respectively.\n",
- "* `length`: of the contract, in years.\n",
- "* `m_gender` and `a_gender`: of master and apprentice respectively.\n",
- "* `a_age`: age of the apprentice at entry, in years.\n",
- "* `female_guarantor`: if at least one of the contract's guarantors was female, boolean."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Trascrizione \n",
- " Standard \n",
- " Gruppo 0 \n",
- " Gruppo 1 \n",
- " Gruppo 2 \n",
- " Gruppo 3 \n",
- " Gruppo 4 \n",
- " Corporazione \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " al negotio del libraro \n",
- " librer \n",
- " libraio \n",
- " librai - diverse specializzazioni \n",
- " stampa \n",
- " altre lavorazioni manifatturiere \n",
- " beni \n",
- " libreri, stampatori e ligadori \n",
- " \n",
- " \n",
- " 1 \n",
- " arte de far arpicordi \n",
- " arte de far arpicordi \n",
- " fabbricatore di arpicordi \n",
- " fabbricatore di strumenti musicali \n",
- " musica \n",
- " altri servizi \n",
- " servizi \n",
- " NaN \n",
- " \n",
- " \n",
- " 2 \n",
- " arte de' colori \n",
- " arte dei colori \n",
- " fabbricazione/vendita di colori \n",
- " colori \n",
- " colori \n",
- " decorazioni e mestieri dell'arte \n",
- " beni \n",
- " spezieri \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Trascrizione Standard \\\n",
- "0 al negotio del libraro librer \n",
- "1 arte de far arpicordi arte de far arpicordi \n",
- "2 arte de' colori arte dei colori \n",
- "\n",
- " Gruppo 0 Gruppo 1 \\\n",
- "0 libraio librai - diverse specializzazioni \n",
- "1 fabbricatore di arpicordi fabbricatore di strumenti musicali \n",
- "2 fabbricazione/vendita di colori colori \n",
- "\n",
- " Gruppo 2 Gruppo 3 Gruppo 4 \\\n",
- "0 stampa altre lavorazioni manifatturiere beni \n",
- "1 musica altri servizi servizi \n",
- "2 colori decorazioni e mestieri dell'arte beni \n",
- "\n",
- " Corporazione \n",
- "0 libreri, stampatori e ligadori \n",
- "1 NaN \n",
- "2 spezieri "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_professions.head(3)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The professions data frame contains a classification system for each profession as found in the records (transcription, first column). The last column is the guild (or corporation) which governed the given profession. This work was performed manually by historians. We don't use it here as the classifications we need are already part of the main dataframe."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Questions\n",
- "\n",
- "* Plot the distribution (histogram) of the apprentices' age, contract length, annual salary and start year.\n",
- "* Calculate the proportion of female apprentices and masters, and of contracts with a female guarantor.\n",
-    "* How likely is it for a female apprentice to have a female master? And for a male apprentice?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "salaries_male_guarantor = df_contracts[\n",
- " df_contracts.female_guarantor == 0\n",
- "].annual_salary"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "salaries_female_guarantor = df_contracts[\n",
- " df_contracts.female_guarantor == 1\n",
- "].annual_salary"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "salaries_male_guarantor.hist()\n",
- "salaries_female_guarantor.hist()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAATKUlEQVR4nO3df6zd9X3f8edrkKQSzjCMzGKG1WRyK9GgUbgCpKbVtbKCIVudbFMEQsTOD7mTQErUTIvTqAM1jeRuSapFzeicYoWsaW6ZkigWkFHXwkP5gwbMHIwhFJc4G1eurQZq4iTKRvbeH+dr63Bzr+/18bnnnOvP8yEdne/5fH+c1/ne69c553u+9zhVhSSpDX9v3AEkSaNj6UtSQyx9SWqIpS9JDbH0Jakh5487wOlccskltW7duoHW/eEPf8gFF1ww3EDLZKVkNefwrZSs5hy+5cy6b9++v62qt8w7s6om9nLttdfWoB599NGB1x21lZLVnMO3UrKac/iWMyvwZC3Qqx7ekaSGWPqS1BBLX5IaYulLUkMsfUlqiKUvSQ2x9CWpIZa+JDXE0pekhkz01zAM07ptD52aPrz9nWNMIknj4yt9SWqIpS9JDbH0Jakhlr4kNcTSl6SGWPqS1BBLX5IaYulLUkMsfUlqiKUvSQ2x9CWpIZa+JDXE0pekhlj6ktQQS1+SGmLpS1JDLH1JaoilL0kNsfQlqSGLln6Sy5M8muTZJAeTfKgbvyfJbJL93eWWvnU+luRQkueT3NQ3vrEbO5Rk2/I8JEnSQpbyH6O/Bnykqp5K8mZgX5Ld3bw/qKpP9S+c5ErgVuCXgH8E/EWSX+hmfw74deAl4Ikku6rq2WE8EEnS4hYt/ao6Ahzppn+Q5Dlg7WlW2QTMVNVPgO8mOQRc1807VFUvAiSZ6Za19CVpRFJVS184WQc8BrwN+C1gC/Aq8CS9dwOvJPlD4PGq+pNunfuAb3Sb2FhVH+zG7wCur6q75tzHVmArwJo1a66dmZkZ6IGdOHGCVatWnbp9YPb4qemr1l440DaXy9ysk8qcw7dSsppz+JYz64YNG/ZV1dR885ZyeAeAJKuArwAfrqpXk9wLfAKo7vrTwPvPNmxV7QB2AExNTdX09PRA29m7dy/9627Z9tCp6cO3D7bN5TI366Qy5/CtlKzmHL5xZV1S6Sd5A73C/1JVfRWgqo72zf888GB3cxa4vG/1y7oxTjMuSRqBpZy9E+A+4Lmq+kzf+KV9i70beKab3gXcmuRNSa4A1gPfAp4A1ie5Iskb6X3Yu2s4D0OStBRLeaX/K8AdwIEk+7ux3wZuS3I1vcM7h4HfBKiqg0keoPcB7WvAnVX1U4AkdwGPAOcBO6vq4NAeiSRpUUs5e+ebQOaZ9fBp1vkk8Ml5xh8+3XqSpOXlX+RKUkMsfUlqiKUvSQ2x9CWpIZa+JDXE0pekhlj6ktQQS1+SGmLpS1JDLH1JaoilL0kNsfQlqSGWviQ1xNKXpIZY+pLUEEtfkhpi6UtSQyx9SWqIpS9JDbH0Jakhi/7H6OeiddseOjV9ePs7x5hEkkbLV/qS1BBLX5IaYulLUkMsfUlqiKUvSQ2x9CWpIZa+JDVk0dJPcnmSR5M8m+Rgkg914xcn2Z3khe76om48ST6b5FCSp5Nc07etzd3yLyTZvHwPS5I0n6W80n8N+EhVXQncANyZ5EpgG7CnqtYDe7rbADcD67vLVuBe6D1JAHcD1wPXAXeffKKQJI3GoqVfVUeq6qlu+gfAc8BaYBNwf7fY/cC7uulNwBer53FgdZJLgZuA3VX1clW9AuwGNg7zwUiSTi9VtfSFk3XAY8DbgP9VVau78QCvVNXqJA8C26vqm928PcBHgWng56rq97rx3wF+XFWfmnMfW+m9Q2DNmjXXzszMDPTATpw4wapVq07dPjB7fN7lrlp74UDbH6a5WSeVOYdvpWQ15/AtZ9YNGzbsq6qp+eYt+bt3kqwC
vgJ8uKpe7fV8T1VVkqU/e5xGVe0AdgBMTU3V9PT0QNvZu3cv/etu6fu+nX6Hbx9s+8M0N+ukMufwrZSs5hy+cWVd0tk7Sd5Ar/C/VFVf7YaPdodt6K6PdeOzwOV9q1/WjS00LkkakaWcvRPgPuC5qvpM36xdwMkzcDYDX+8bf293Fs8NwPGqOgI8AtyY5KLuA9wbuzFJ0ogs5fDOrwB3AAeS7O/GfhvYDjyQ5APA94D3dPMeBm4BDgE/At4HUFUvJ/kE8ES33O9W1cvDeBCSpKVZtPS7D2SzwOx3zLN8AXcusK2dwM4zCShJGh7/IleSGmLpS1JDLH1JaoilL0kNsfQlqSGWviQ1xNKXpIZY+pLUEEtfkhpi6UtSQyx9SWqIpS9JDbH0Jakhlr4kNcTSl6SGWPqS1BBLX5IaYulLUkMsfUlqiKUvSQ2x9CWpIZa+JDXE0pekhlj6ktQQS1+SGmLpS1JDLH1JaoilL0kNWbT0k+xMcizJM31j9ySZTbK/u9zSN+9jSQ4leT7JTX3jG7uxQ0m2Df+hSJIWs5RX+l8ANs4z/gdVdXV3eRggyZXArcAvdev85yTnJTkP+BxwM3AlcFu3rCRphM5fbIGqeizJuiVubxMwU1U/Ab6b5BBwXTfvUFW9CJBkplv22TOPLEkaVKpq8YV6pf9gVb2tu30PsAV4FXgS+EhVvZLkD4HHq+pPuuXuA77RbWZjVX2wG78DuL6q7prnvrYCWwHWrFlz7czMzEAP7MSJE6xaterU7QOzx+dd7qq1Fw60/WGam3VSmXP4VkpWcw7fcmbdsGHDvqqamm/eoq/0F3Av8AmguutPA+8fcFuvU1U7gB0AU1NTNT09PdB29u7dS/+6W7Y9NO9yh28fbPvDNDfrpDLn8K2UrOYcvnFlHaj0q+royekknwce7G7OApf3LXpZN8ZpxiVJIzLQKZtJLu27+W7g5Jk9u4Bbk7wpyRXAeuBbwBPA+iRXJHkjvQ97dw0eW5I0iEVf6Sf5MjANXJLkJeBuYDrJ1fQO7xwGfhOgqg4meYDeB7SvAXdW1U+77dwFPAKcB+ysqoPDfjCSpNNbytk7t80zfN9plv8k8Ml5xh8GHj6jdJKkofIvciWpIZa+JDXE0pekhlj6ktQQS1+SGmLpS1JDLH1JaoilL0kNsfQlqSGWviQ1xNKXpIZY+pLUEEtfkhpi6UtSQyx9SWqIpS9JDbH0JakhA/3H6OeSddseOjV9ePs7x5hEkpafr/QlqSGWviQ1xNKXpIZY+pLUEEtfkhpi6UtSQyx9SWqIpS9JDbH0Jakhi5Z+kp1JjiV5pm/s4iS7k7zQXV/UjSfJZ5McSvJ0kmv61tncLf9Cks3L83AkSaezlFf6XwA2zhnbBuypqvXAnu42wM3A+u6yFbgXek8SwN3A9cB1wN0nnygkSaOzaOlX1WPAy3OGNwH3d9P3A+/qG/9i9TwOrE5yKXATsLuqXq6qV4Dd/OwTiSRpmaWqFl8oWQc8WFVv627/XVWt7qYDvFJVq5M8CGyvqm928/YAHwWmgZ+rqt/rxn8H+HFVfWqe+9pK710Ca9asuXZmZmagB3bixAlWrVp16vaB2eOLrnPV2gsHuq+zNTfrpDLn8K2UrOYcvuXMumHDhn1VNTXfvLP+ls2qqiSLP3MsfXs7gB0AU1NTNT09PdB29u7dS/+6W/q+TXMhh28f7L7O1tysk8qcw7dSsppz+MaVddCzd452h23oro9147PA5X3LXdaNLTQuSRqhQUt/F3DyDJzNwNf7xt/bncVzA3C8qo4AjwA3Jrmo+wD3xm5MkjRCix7eSfJlesfkL0nyEr2zcLYDDyT5APA94D3d4g8DtwCHgB8B7wOoqpeTfAJ4olvud6tq7ofDkqRltmjpV9VtC8x6xzzLFnDnAtvZCew8o3SSpKHyL3IlqSGWviQ1xNKXpIZY+pLUEEtfkhpi6UtSQyx9SWqIpS9JDbH0Jakhlr4kNcTSl6SGWPqS
1BBLX5IaYulLUkMsfUlqiKUvSQ2x9CWpIZa+JDXE0pekhlj6ktQQS1+SGmLpS1JDLH1JaoilL0kNsfQlqSGWviQ1xNKXpIZY+pLUkLMq/SSHkxxIsj/Jk93YxUl2J3mhu76oG0+SzyY5lOTpJNcM4wFIkpZuGK/0N1TV1VU11d3eBuypqvXAnu42wM3A+u6yFbh3CPctSToDy3F4ZxNwfzd9P/CuvvEvVs/jwOokly7D/UuSFpCqGnzl5LvAK0AB/6WqdiT5u6pa3c0P8EpVrU7yILC9qr7ZzdsDfLSqnpyzza303gmwZs2aa2dmZgbKduLECVatWnXq9oHZ44uuc9XaCwe6r7M1N+ukMufwrZSs5hy+5cy6YcOGfX1HX17n/LPc9turajbJPwR2J/lO/8yqqiRn9KxSVTuAHQBTU1M1PT09ULC9e/fSv+6WbQ8tus7h2we7r7M1N+ukMufwrZSs5hy+cWU9q8M7VTXbXR8DvgZcBxw9edimuz7WLT4LXN63+mXdmCRpRAYu/SQXJHnzyWngRuAZYBewuVtsM/D1bnoX8N7uLJ4bgONVdWTg5JKkM3Y2h3fWAF/rHbbnfOBPq+q/J3kCeCDJB4DvAe/pln8YuAU4BPwIeN9Z3LckaQADl35VvQj803nGvw+8Y57xAu4c9P4kSWfPv8iVpIac7dk755R1fWf4HN7+zjEmkaTl4St9SWrIOf1Kf90Szs2XpJb4Sl+SGmLpS1JDLH1JaoilL0kNsfQlqSGWviQ1xNKXpIZY+pLUkHP6j7POhl/JIOlc5Ct9SWqIpS9JDbH0Jakhlr4kNcTSl6SGWPqS1BBP2VwCT9+UdK7wlb4kNcTSl6SGWPqS1BBLX5Ia4ge5Z8EPeCWtNJb+MvOJQdIksfTPUH+JS9JKY+mPSf+Txxc2XjDGJJJaMvLST7IR+E/AecAfV9X2UWdYDstxGMdDQ5KGbaSln+Q84HPArwMvAU8k2VVVz44yR4t8ApEEo3+lfx1wqKpeBEgyA2wCzqnSX+i4/0LjB2aPs6Wbd6aFPHeby1HoJ+/jI1e9xvQQtgM+8Ujjkqoa3Z0l/xrYWFUf7G7fAVxfVXf1LbMV2Nrd/EXg+QHv7hLgb88i7iitlKzmHL6VktWcw7ecWX++qt4y34yJ+yC3qnYAO852O0merKqpIURadislqzmHb6VkNefwjSvrqP8idxa4vO/2Zd2YJGkERl36TwDrk1yR5I3ArcCuEWeQpGaN9PBOVb2W5C7gEXqnbO6sqoPLdHdnfYhohFZKVnMO30rJas7hG0vWkX6QK0kaL79lU5IaYulLUkPOydJPsjHJ80kOJdk27jwnJbk8yaNJnk1yMMmHuvF7kswm2d9dbhl3VoAkh5Mc6DI92Y1dnGR3khe664vGnPEX+/bb/iSvJvnwJOzTJDuTHEvyTN/YvPsvPZ/tfmefTnLNBGT9j0m+0+X5WpLV3fi6JD/u27d/NOacC/6sk3ys26fPJ7lpzDn/rC/j4ST7u/HR7s+qOqcu9D4g/mvgrcAbgW8DV447V5ftUuCabvrNwF8BVwL3AP923PnmyXsYuGTO2H8AtnXT24DfH3fOOT/7vwF+fhL2KfBrwDXAM4vtP+AW4BtAgBuAv5yArDcC53fTv9+XdV3/chOQc96fdfdv69vAm4Arul44b1w558z/NPDvx7E/z8VX+qe+6qGq/g9w8qsexq6qjlTVU930D4DngLXjTXXGNgH3d9P3A+8aX5Sf8Q7gr6vqe+MOAlBVjwEvzxleaP9tAr5YPY8Dq5NcOpKgzJ+1qv68ql7rbj5O7+9qxmqBfbqQTcBMVf2kqr4LHKLXD8vudDmTBHgP8OVRZJnrXCz9tcD/7rv9EhNYrEnWAb8M/GU3dFf3NnrnuA+Z9Cngz5Ps674eA2BNVR3ppv8GWDOeaPO6ldf/Q5rEfbrQ/pv039v303snctIVSf5nkv+R5FfHFarPfD/rSd2nvwocraoX+sZGtj/P
xdKfeElWAV8BPlxVrwL3Av8EuBo4Qu+t3yR4e1VdA9wM3Jnk1/pnVu+96USc89v9sd9vAP+tG5rUfXrKJO2/00nyceA14Evd0BHgH1fVLwO/Bfxpkr8/rnysgJ/1HLfx+hcnI92f52LpT/RXPSR5A73C/1JVfRWgqo5W1U+r6v8Bn2dEb0EXU1Wz3fUx4Gv0ch09edihuz42voSvczPwVFUdhcndpyy8/yby9zbJFuCfA7d3T1J0h0u+303vo3es/BfGlfE0P+uJ26dJzgf+JfBnJ8dGvT/PxdKf2K966I7l3Qc8V1Wf6RvvP3b7buCZueuOWpILkrz55DS9D/WeobcvN3eLbQa+Pp6EP+N1r54mcZ92Ftp/u4D3dmfx3AAc7zsMNBbp/YdH/w74jar6Ud/4W9L7vzFI8lZgPfDieFKe9me9C7g1yZuSXEEv57dGnW+OfwZ8p6peOjkw8v05qk+MR3mhdybEX9F7xvz4uPP05Xo7vbfzTwP7u8stwH8FDnTju4BLJyDrW+md+fBt4ODJ/Qj8A2AP8ALwF8DFE5D1AuD7wIV9Y2Pfp/SehI4A/5fe8eQPLLT/6J2187nud/YAMDUBWQ/ROyZ+8nf1j7pl/1X3O7EfeAr4F2POueDPGvh4t0+fB24eZ85u/AvAv5mz7Ej3p1/DIEkNORcP70iSFmDpS1JDLH1JaoilL0kNsfQlqSGWviQ1xNKXpIb8f1JfOTuBGZthAAAAAElFTkSuQmCC\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "df_contracts.annual_salary.hist(bins=100)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAD4CAYAAADo30HgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVq0lEQVR4nO3df5Bd5X3f8fe3yGDCupKAdEslTYUbjTMENQnsAKlTzypKsQCPRTsOg4exJYeMxlNISa1MkOtpyKT1VG5KPHaauqNGjOWG8eIQuyj8iK3IbD3+Q8SIYsQP2yxErrUji9pgOWtIHaXf/nGfde6z7K97z713d+H9mtnZc57znHO+99yz96Pz4x5FZiJJ0rS/s9QFSJKWF4NBklQxGCRJFYNBklQxGCRJlVVLXcB8Lrzwwty4cWNH8/zgBz/gvPPO609BPWB9zVhfM9bXzEqp7+jRo9/JzB/vekGZuWx/Lr/88uzUww8/3PE8g2R9zVhfM9bXzEqpD3g0G3z2eipJklQxGCRJFYNBklQxGCRJFYNBklQxGCRJFYNBklQxGCRJFYNBklRZ1o/EeK3ZuOcBdm8+w849Dyyq//G91/W5Ikl6NY8YJEkVg0GSVDEYJEkVg0GSVDEYJEkVg0GSVDEYJEkVg0GSVDEYJEkVg0GSVFkwGCLiroh4ISKebGv7nYj4WkQ8ERGfi4g1bdM+GBETEfH1iHh7W/u20jYREXt6/kokST2xmCOGTwLbZrQdAi7NzH8MfAP4IEBEXALcCPxUmee/RMRZEXEW8PvANcAlwLtLX0nSMrNgMGTml4AXZ7R9ITPPlNEjwPoyvB0Yy8z/m5l/AUwAV5Sficx8PjN/CIyVvpKkZSYyc+FOERuB+zPz0lmm/QlwT2b+YUT8Z+BIZv5hmbYfeKh03ZaZv1La3wNcmZm3zrK8XcAugOHh4cvHxsY6ekFTU1MMDQ11NM+gHJs8zfC5cOqVxfXfvG51fwuaxXLefmB9TVlfMyulvi1bthzNzJFul9PosdsR8SHgDHB3k+W0y8x9wD6AkZGRHB0d7Wj+8fFxOp1nUHaWx27feWxxm/34TaP9LWgWy3n7gfU1ZX3NvF7q6zoYImIn8A5ga/7tYccksKGt2/rSxjztkqRlpKvbVSNiG/AbwDsz8+W2SQeBGyPinIi4GNgE/DnwFWBTRFwcEWfTukB9sFnpkqR+WPCIISI+DYwCF0bECeAOWnchnQMcighoXVd4f2Y+FRGfAZ6mdYrplsz8m7KcW4HPA2cBd2XmU314PZKkhhYMhsx89yzN++fp/2Hgw7O0Pwg82FF1kqSB85vPkqRKo7uStPxs3PNAR/2P772uT5VIWqk8YpAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVQwGSVLFYJAkVRYMhoi4KyJeiIgn29rOj4hDEfFs+b22tEdEfDwiJiLiiYi4rG2eHaX/sxGxoz8vR5LU1GKOGD4JbJvRtgc4nJmbgMNlHOAaYFP52QV8AlpBAtwBXAlcAdwxHSaSpOVlwWDIzC8BL85o3g4cKMMHgOvb2j+VLUeANRFxEfB24FBmvpiZLwGHeHXYSJKWgcjMhTtFbATuz8xLy/j3MnNNGQ7gpcxcExH3A3sz88tl2mHgdmAUeGNm/vvS/m+BVzLzP82yrl20jjYYHh6+fGxsrKMXNDU1xdDQUEfzDMqxydMMnwunXllc/83rVne1jk7MXMdy3n5gfU1ZXzMrpb4tW7YczcyRbpezqmkhmZkRsXC6LH55+4B9ACMjIzk6OtrR/OPj43Q6z6Ds3PMAuzef4c5ji9vsx28a7WodnZi5juW8/cD6mrK+Zl4v9XV7V9KpcoqI8vuF0j4JbGjrt760zdUuSVpmug2Gg8D0nUU7gPva2t9b7k66Cjid
mSeBzwNXR8TactH56tImSVpmFjynERGfpnWN4MKIOEHr7qK9wGci4mbgm8ANpfuDwLXABPAy8D6AzHwxIv4d8JXS77czc+YFbUnSMrBgMGTmu+eYtHWWvgncMsdy7gLu6qg6SdLA+c1nSVKl8V1JWtk2zriLaffmM/Pe2XR873X9LknSEvOIQZJUMRgkSRWDQZJU8RpDAzPPz0vSa4FHDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkSqNgiIh/HRFPRcSTEfHpiHhjRFwcEY9ExERE3BMRZ5e+55TxiTJ9Y09egSSpp7oOhohYB/wrYCQzLwXOAm4EPgJ8NDN/AngJuLnMcjPwUmn/aOknSVpmmp5KWgWcGxGrgB8DTgK/ANxbph8Ari/D28s4ZfrWiIiG65ck9VhkZvczR9wGfBh4BfgCcBtwpBwVEBEbgIcy89KIeBLYlpknyrTngCsz8zszlrkL2AUwPDx8+djYWEc1TU1NMTQ01PVr6sSxydMdzzN8Lpx6ZXF9N69b3fHyu6mp3UL1dVNTLw3y/e2G9TVjfc1M17dly5ajmTnS7XJWdTtjRKyldRRwMfA94I+Abd0ub1pm7gP2AYyMjOTo6GhH84+Pj9PpPN3aueeBjufZvfkMdx5b3GY/ftNox8vvpqZ2C9XXTU29NMj3txvW14z1NdOr+pqcSvpF4C8y8/9k5l8DnwXeCqwpp5YA1gOTZXgS2ABQpq8Gvttg/ZKkPmgSDP8buCoifqxcK9gKPA08DLyr9NkB3FeGD5ZxyvQvZpPzWJKkvug6GDLzEVoXkR8DjpVl7QNuBz4QERPABcD+Mst+4ILS/gFgT4O6JUl90vU1BoDMvAO4Y0bz88AVs/T9K+CXmqxPS29jh9cwju+9rk+VSOoXv/ksSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkisEgSaoYDJKkSqOH6Km/On1gnST1gkcMkqSKwSBJqhgMkqSKwSBJqhgMkqSKwSBJqhgMkqSKwSBJqhgMkqSKwSBJqjQKhohYExH3RsTXIuKZiPi5iDg/Ig5FxLPl99rSNyLi4xExERFPRMRlvXkJkqReanrE8DHgTzPzJ4GfBp4B9gCHM3MTcLiMA1wDbCo/u4BPNFy3JKkPug6GiFgNvA3YD5CZP8zM7wHbgQOl2wHg+jK8HfhUthwB1kTERd2uX5LUH5GZ3c0Y8TPAPuBpWkcLR4HbgMnMXFP6BPBSZq6JiPuBvZn55TLtMHB7Zj46Y7m7aB1RMDw8fPnY2FhHdU1NTTE0NNTVa+rUscnTHc8zfC6ceqUPxfRIr+vbvG517xbGYN/fblhfM9bXzHR9W7ZsOZqZI90up8ljt1cBlwG/mpmPRMTH+NvTRgBkZkZER8mTmftoBQ4jIyM5OjraUVHj4+N0Ok+3dnbxWOzdm89w57Hl+7TzXtd3/KbRni0LBvv+dsP6mrG+ZnpVX5NrDCeAE5n5SBm/l1ZQnJo+RVR+v1CmTwIb2uZfX9okSctI18GQmd8GvhURbylNW2mdVjoI7ChtO4D7yvBB4L3l7qSrgNOZebLb9UuS+qPpOYNfBe6OiLOB54H30Qqbz0TEzcA3gRtK3weBa4EJ4OXSV5K0zDQKhsx8HJjtAsfWWfomcEuT9UmS+s9vPkuSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKgaDJKliMEiSKo2DISLOioj/FRH3l/GLI+KRiJiIiHsi4uzSfk4ZnyjTNzZdtySp93pxxHAb8Ezb+EeAj2bmTwAvATeX9puBl0r7R0s/SdIy0ygYImI9cB3wB2U8gF8A7i1d
DgDXl+HtZZwyfWvpL0laRiIzu5854l7gPwBvAn4d2AkcKUcFRMQG4KHMvDQingS2ZeaJMu054MrM/M6MZe4CdgEMDw9fPjY21lFNU1NTDA0Ndf2aOnFs8nTH8wyfC6de6UMxPdLr+javW927hTHY97cb1teM9TUzXd+WLVuOZuZIt8tZ1e2MEfEO4IXMPBoRo90uZ6bM3AfsAxgZGcnR0c4WPT4+TqfzdGvnngc6nmf35jPceazrzd53va7v+E2jPVsWDPb97Yb1NWN9zfSqviafAG8F3hkR1wJvBP4u8DFgTUSsyswzwHpgsvSfBDYAJyJiFbAa+G6D9UuS+qDrawyZ+cHMXJ+ZG4EbgS9m5k3Aw8C7SrcdwH1l+GAZp0z/YjY5jyVJ6ot+fI/hduADETEBXADsL+37gQtK+weAPX1YtySpoZ6cTM7McWC8DD8PXDFLn78CfqkX65Mk9Y/ffJYkVQwGSVJl+d43qdeEjR3e0nt873V9qkTSYnnEIEmqGAySpIrBIEmqGAySpIrBIEmqGAySpIrBIEmqGAySpIrBIEmqGAySpIrBIEmq+KykNp0+10eSXos8YpAkVQwGSVLFYJAkVQwGSVLFYJAkVbwrSVpAJ3er7d58htH+lSINhMGgFa2bW4z970Ol+XkqSZJU6ToYImJDRDwcEU9HxFMRcVtpPz8iDkXEs+X32tIeEfHxiJiIiCci4rJevQhJUu80OWI4A+zOzEuAq4BbIuISYA9wODM3AYfLOMA1wKbyswv4RIN1S5L6pOtgyMyTmflYGf5L4BlgHbAdOFC6HQCuL8PbgU9lyxFgTURc1O36JUn90ZNrDBGxEfhZ4BFgODNPlknfBobL8DrgW22znShtkqRlJDKz2QIihoD/CXw4Mz8bEd/LzDVt01/KzLURcT+wNzO/XNoPA7dn5qMzlreL1qkmhoeHLx8bG+uonqmpKYaGhrp6LccmT3c1XyeGz4VTr/R9NV1b6vo2r1s97/SZ728379lC65ipk3UMnwt/7/zOlj9ITf4+BsH6mpmub8uWLUczc6Tb5TS6XTUi3gD8MXB3Zn62NJ+KiIsy82Q5VfRCaZ8ENrTNvr60VTJzH7APYGRkJEdHRzuqaXx8nE7nmbZzAE9X3b35DHceW753CS91fcdvGp13+sz3t5v3bKF1zNTJOnZvPsMNXe5/g9Dk72MQrK+ZXtXX5K6kAPYDz2Tm77ZNOgjsKMM7gPva2t9b7k66CjjddspJkrRMNPmn4VuB9wDHIuLx0vZvgL3AZyLiZuCbwA1l2oPAtcAE8DLwvgbrliT1SdfBUK4VxByTt87SP4Fbul2fJGkwlu/Jbr0uLfSIi92bzwzkWpD0euYjMSRJFYNBklQxGCRJFYNBklQxGCRJFYNBklQxGCRJFb/HIK0wnf53pv5XpuqURwySpIrBIEmqGAySpIrBIEmqePFZrzudXryVXm88YpAkVQwGSVLFYJAkVbzGIGng5rrOM9d/xOSX9AbLIwZJUsVgkCRVDAZJUsVrDNJrXPv5/LnO4c/U6Tn9fn83xAcHDtZrOhj8IpMkde41HQzSSuA/YLTcDDwYImIb8DHgLOAPMnPvoGuQND/DamGv5dNbAw2GiDgL+H3gnwEngK9ExMHMfHqQdUj95Ifq0uvmPVhJH9z9NugjhiuAicx8HiAixoDtgMEgaUktJkwWe/G+2+W3W8qgiswc3Moi3gVsy8xfKePvAa7MzFvb+uwCdpXRtwBf73A1FwLf6UG5/WJ9zVhfM9bXzEqp7x9m5o93u5Bld/E5M/cB+7qdPyIezcyRHpbUU9bXjPU1Y33NvF7qG/QX3CaBDW3j60ubJGmZGHQwfAXYFBEXR8TZwI3AwQHXIEmax0BPJWXmmYi4Ffg8rdtV78rMp3q8mq5PQw2I9TVjfc1YXzOvi/oGevFZkrT8+RA9SVLFYJAkVVZsMETEtoj4ekRMRMSeWaafExH3lOmPRMTGAda2ISIejoin
I+KpiLhtlj6jEXE6Ih4vP785qPrK+o9HxLGy7kdnmR4R8fGy/Z6IiMsGWNtb2rbL4xHx/Yj4tRl9Brr9IuKuiHghIp5sazs/Ig5FxLPl99o55t1R+jwbETsGWN/vRMTXyvv3uYhYM8e88+4LfazvtyJisu09vHaOeef9W+9jffe01XY8Ih6fY95BbL9ZP1P6tg9m5or7oXXh+jngzcDZwFeBS2b0+ZfAfy3DNwL3DLC+i4DLyvCbgG/MUt8ocP8SbsPjwIXzTL8WeAgI4CrgkSV8r79N6ws7S7b9gLcBlwFPtrX9R2BPGd4DfGSW+c4Hni+/15bhtQOq72pgVRn+yGz1LWZf6GN9vwX8+iLe/3n/1vtV34zpdwK/uYTbb9bPlH7tgyv1iOFHj9bIzB8C04/WaLcdOFCG7wW2RkQMorjMPJmZj5XhvwSeAdYNYt09tB34VLYcAdZExEVLUMdW4LnM/OYSrPtHMvNLwIszmtv3sQPA9bPM+nbgUGa+mJkvAYeAbYOoLzO/kJlnyugRWt8bWhJzbL/FWMzfemPz1Vc+N24APt3r9S7WPJ8pfdkHV2owrAO+1TZ+gld/8P6oT/njOA1cMJDq2pRTWD8LPDLL5J+LiK9GxEMR8VODrYwEvhARR6P1GJKZFrONB+FG5v6DXMrtBzCcmSfL8LeB4Vn6LJft+Mu0jgBns9C+0E+3llNdd81xGmQ5bL9/CpzKzGfnmD7Q7TfjM6Uv++BKDYYVISKGgD8Gfi0zvz9j8mO0To/8NPB7wP8YcHk/n5mXAdcAt0TE2wa8/gVF60uQ7wT+aJbJS739Ktk6Zl+W935HxIeAM8Ddc3RZqn3hE8A/An4GOEnrdM1y9G7mP1oY2Pab7zOll/vgSg2GxTxa40d9ImIVsBr47kCqa63zDbTewLsz87Mzp2fm9zNzqgw/CLwhIi4cVH2ZOVl+vwB8jtYhe7vl8PiSa4DHMvPUzAlLvf2KU9On18rvF2bps6TbMSJ2Au8AbiofHK+yiH2hLzLzVGb+TWb+P+C/zbHepd5+q4B/AdwzV59Bbb85PlP6sg+u1GBYzKM1DgLTV9/fBXxxrj+MXivnJPcDz2Tm787R5+9PX/OIiCtovRcDCa6IOC8i3jQ9TOsi5ZMzuh0E3hstVwGn2w5ZB2XOf6kt5fZr076P7QDum6XP54GrI2JtOVVydWnru2j9p1i/AbwzM1+eo89i9oV+1dd+zeqfz7HepX6Mzi8CX8vME7NNHNT2m+czpT/7YD+vpPfzh9ZdM9+gdcfCh0rbb9P6IwB4I61TEBPAnwNvHmBtP0/rkO4J4PHycy3wfuD9pc+twFO07rI4AvyTAdb35rLer5Yaprdfe31B6z9Veg44BowM+P09j9YH/eq2tiXbfrQC6iTw17TO0d5M65rVYeBZ4M+A80vfEVr/O+H0vL9c9sMJ4H0DrG+C1rnl6X1w+i69fwA8ON++MKD6/nvZt56g9QF30cz6yvir/tYHUV9p/+T0PtfWdym231yfKX3ZB30khiSpslJPJUmS+sRgkCRVDAZJUsVgkCRVDAZJUsVgkCRVDAZJUuX/A7zU+O5qfyxrAAAAAElFTkSuQmCC\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "df_contracts[df_contracts.annual_salary < 20].annual_salary.hist(bins=25)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAATlElEQVR4nO3df6zd9X3f8edrEEjKzTCM1iKGzVRyKhG8ZuEWkNpu14kKhvwBUasIygDnh1xNMLUa3eK0q8JKM3lVaKb8KJojrJKF5Yq1SbDAKXKseizVWMCMYgxhOMHJuEP2UlNTJyyt0/f+OF9bp+xe3+vre+45Pp/nQzo63/P5fs73+32f7zmv8z3f7/d+b6oKSVIb/s6wF0CStHwMfUlqiKEvSQ0x9CWpIYa+JDXkzGEvwIlccMEFtXr16oFN//vf/z7nnHPOwKY/bONeH4x/jeNeH4x/jcOob/fu3d+rqh+fbdxIh/7q1at58sknBzb9Xbt2MTU1NbDpD9u41wfjX+O41wfjX+Mw6kvynbnGuXtHkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaMtJ/kXu6Wr3pkaHNe//m9w5t3pJG37xb+kkuTvInSZ5LsjfJr3btdyWZSfJ0d7uu7zkfTbIvyQtJrulrX9+17UuyaTAlSZLmspAt/aPAnVX1VJK3AruT7OjGfbKqPtHfOcmlwI3AO4C3AV9L8vZu9GeBXwBeBp5Isq2qnluKQiRJ85s39KvqFeCVbvgvkzwPrDrBU64Hpqvqh8BLSfYBV3Tj9lXVtwGSTHd9DX1JWiY5mX+MnmQ18BhwGfAvgA3Aa8CT9H4NvJrkM8DjVfWF7jn3AV/tJrG+qj7ctd8CXFlVd7xhHhuBjQArV668fHp6etHFzefIkSNMTEws+XT3zBxe8mku1NpV5x4fHlR9o2Tcaxz3+mD8axxGfevWrdtdVZOzjVvwgdwkE8AfAb9WVa8luRe4G6ju/h7gg6e6sFW1BdgCMDk5WYO8JOmgLnm6YZgHcm+eOj487peshfGvcdzrg/GvcdTqW1DoJ3kTvcB/oKq+BFBVB/rGfw54uHs4A1zc9/SLujZO0C5JWgYLOXsnwH3A81X1e33tF/Z1ex/wbDe8DbgxydlJLgHWAN8AngDWJLkkyVn0DvZuW5oyJEkLsZAt/Z8FbgH2JHm6a/sN4KYk76S3e2c/8CsAVbU3yYP0DtAeBW6vqh8BJLkDeBQ4A9haVXuXrBJJ0rwWcvbO14HMMmr7CZ7zceDjs7RvP9HzJEmD5WUYJKkhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ2ZN/STXJzkT5I8l2Rvkl/t2s9PsiPJi939eV17knwqyb4kzyR5V9+0buv6v5jktsGVJUmazZkL6HMUuLOqnkryVmB3kh3ABmBnVW1OsgnYBHwEuBZY092uBO4FrkxyPvAxYBKobjrbqurVpS5K7dgzc5gNmx4Zyrz3b37vUOYrnYp5t/Sr6pWqeqob/kvgeWAVcD1wf9ftfuCGbvh64PPV8ziwIsmFwDXAjqo61AX9DmD9UhYjSTqxVNXCOyergceAy4DvVtWKrj3Aq1W1IsnDwOaq+no3bie9XwBTwJur6ne69t8CXq+qT7xhHhuBjQArV668fHp6+lTqO6EjR44wMTGx5NPdM3N4yae5UGtXnXt8eFD1jZKDhw5z4PXhzLv/tR6UFtbhuNc4jPrWrVu3u6omZxu3kN07ACSZAP4I+LWqeq2X8z1VVUkW/u1xAlW1BdgCMDk5WVNTU0sx2Vnt2rWLQUx/WLsbAPbf
PHV8eFD1jZJPP/AQ9+xZ8Nt4SfW/1oPSwjoc9xpHrb4Fnb2T5E30Av+BqvpS13yg221Dd3+wa58BLu57+kVd21ztkqRlspCzdwLcBzxfVb/XN2obcOwMnNuAh/rab+3O4rkKOFxVrwCPAlcnOa870+fqrk2StEwW8rv4Z4FbgD1Jnu7afgPYDDyY5EPAd4D3d+O2A9cB+4AfAB8AqKpDSe4Gnuj6/XZVHVqKIiRJCzNv6HcHZDPH6PfM0r+A2+eY1lZg68ksoCRp6fgXuZLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyHD++4SkRVs9zH/S4/8FPu25pS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSHzhn6SrUkOJnm2r+2uJDNJnu5u1/WN+2iSfUleSHJNX/v6rm1fkk1LX4okaT4L2dL/A2D9LO2frKp3drftAEkuBW4E3tE95/eTnJHkDOCzwLXApcBNXV9J0jKa9x+jV9VjSVYvcHrXA9NV9UPgpST7gCu6cfuq6tsASaa7vs+d/CJLkhYrVTV/p17oP1xVl3WP7wI2AK8BTwJ3VtWrST4DPF5VX+j63Qd8tZvM+qr6cNd+C3BlVd0xy7w2AhsBVq5cefn09PSp1HdCR44cYWJiYsmnu2fm8JJPc6HWrjr3+PCg6hslBw8d5sDrw5l3/2s9KLOtw1F5fy2VcX+fDqO+devW7a6qydnGzbulP4d7gbuB6u7vAT64yGn9LVW1BdgCMDk5WVNTU0sx2Vnt2rWLQUx/w6ZHlnyaC7X/5qnjw4Oqb5R8+oGHuGfPYt/Gp6b/tR6U2dbhqLy/lsq4v09Hrb5FfVqq6sCx4SSfAx7uHs4AF/d1vahr4wTtkqRlsqhTNpNc2PfwfcCxM3u2ATcmOTvJJcAa4BvAE8CaJJckOYvewd5ti19sSdJizLuln+SLwBRwQZKXgY8BU0neSW/3zn7gVwCqam+SB+kdoD0K3F5VP+qmcwfwKHAGsLWq9i51MZKkE1vI2Ts3zdJ83wn6fxz4+Czt24HtJ7V0kqQl5V/kSlJDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIfP+Y3RJOmb1pkeWfJp3rj3Khnmmu3/ze5d8vq1yS1+SGjLWW/rzbZUsZAtDksaJW/qS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktSQeUM/ydYkB5M829d2fpIdSV7s7s/r2pPkU0n2JXkmybv6nnNb1//FJLcNphxJ0oksZEv/D4D1b2jbBOysqjXAzu4xwLXAmu62EbgXel8SwMeAK4ErgI8d+6KQJC2feUO/qh4DDr2h+Xrg/m74fuCGvvbPV8/jwIokFwLXADuq6lBVvQrs4P//IpEkDViqav5OyWrg4aq6rHv8F1W1ohsO8GpVrUjyMLC5qr7ejdsJfASYAt5cVb/Ttf8W8HpVfWKWeW2k9yuBlStXXj49Pb3o4vbMHD7h+JVvgQOvL3ryI2ntqnOPDx85coSJiYkhLs3gHTx0eGjrsP+1HpTZ1uF87+vTzUI+h8vxWg/KMD6H69at211Vk7ONO+WrbFZVJZn/m2Ph09sCbAGYnJysqampRU9rvito3rn2KPfsGa8Lje6/eer48K5duziV1+908OkHHhraOux/rQdltnU4bleGXcjncDle60EZtc/hYs/eOdDttqG7P9i1zwAX9/W7qGubq12StIwWG/rbgGNn4NwGPNTXfmt3Fs9VwOGqegV4FLg6yXndAdyruzZJ0jKa93dxki/S2yd/QZKX6Z2Fsxl4
MMmHgO8A7++6bweuA/YBPwA+AFBVh5LcDTzR9fvtqnrjwWFJ0oDNG/pVddMco94zS98Cbp9jOluBrSe1dJKkJeVf5EpSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIacOewFkKT5rN70yNDmvX/ze4c270FwS1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUkFMK/ST7k+xJ8nSSJ7u285PsSPJid39e154kn0qyL8kzSd61FAVIkhZuKbb011XVO6tqsnu8CdhZVWuAnd1jgGuBNd1tI3DvEsxbknQSBrF753rg/m74fuCGvvbPV8/jwIokFw5g/pKkOaSqFv/k5CXgVaCA/1BVW5L8RVWt6MYHeLWqViR5GNhcVV/vxu0EPlJVT75hmhvp/RJg5cqVl09PTy96+fbMHD7h+JVvgQOvL3ryI2ntqnOPDx85coSJiYkhLs3gHTx0eGjrsP+1HpTZ1uF87+vTzah/Dk91PQ/jc7hu3brdfXtf/pZTveDaz1XVTJKfAHYk+Wb/yKqqJCf1rVJVW4AtAJOTkzU1NbXohdswz0Wa7lx7lHv2jNc15/bfPHV8eNeuXZzK63c6+PQDDw1tHfa/1oMy2zqc7319uhn1z+GprudR+xye0u6dqprp7g8CXwauAA4c223T3R/sus8AF/c9/aKuTZK0TBYd+knOSfLWY8PA1cCzwDbgtq7bbcBD3fA24NbuLJ6rgMNV9cqil1ySdNJO5TfVSuDLvd32nAn8p6r64yRPAA8m+RDwHeD9Xf/twHXAPuAHwAdOYd6SpEVYdOhX1beBn56l/c+B98zSXsDti52fJOnU+Re5ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyKL/MbrUutWbHhn4PO5ce5QNyzAftcMtfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSHLHvpJ1id5Icm+JJuWe/6S1LJlDf0kZwCfBa4FLgVuSnLpci6DJLVsubf0rwD2VdW3q+qvgGng+mVeBklqVqpq+WaW/BKwvqo+3D2+Bbiyqu7o67MR2Ng9/CnghQEu0gXA9wY4/WEb9/pg/Gsc9/pg/GscRn3/oKp+fLYRI3c9/araAmxZjnklebKqJpdjXsMw7vXB+Nc47vXB+Nc4avUt9+6dGeDivscXdW2SpGWw3KH/BLAmySVJzgJuBLYt8zJIUrOWdfdOVR1NcgfwKHAGsLWq9i7nMrzBsuxGGqJxrw/Gv8Zxrw/Gv8aRqm9ZD+RKkobLv8iVpIYY+pLUkLEL/SRbkxxM8mxf211JZpI83d2u69rflOT+JHuSPJ/ko33PGcnLRcxWX9f+z5N8M8neJL/b1/7RroYXklzT137a15fkF5Ls7tbf7iTv7ut/ede+L8mnkmS5a5nLya7DbtzfT3Ikya/3tZ3267Br/4dJ/lvXvifJm7v2kVyHJ/keHb2MqaqxugH/GHgX8Gxf213Ar8/S95eB6W74x4D9wGp6B5m/BfwkcBbwZ8Clw67tBPWtA74GnN09/onu/tJu2c8GLulqOmOM6vtHwNu64cuAmb7nfAO4CgjwVeDaYde2mBr7xv8h8J+PvY/HaB2eCTwD/HT3+O8BZ4zyOjzJ+kYuY8ZuS7+qHgMOLbQ7cE6SM4G3AH8FvMYIXy5ijvr+GbC5qn7Y9TnYtV9P7w33w6p6CdhHr7axqK+q/kdV/e+uz17gLUnOTnIh8Her6vHqfdo+D9ywLAUswEmuQ5LcALxEr8ZjxmIdAlcDz1TVn3Xtf15VPxrl
dXiS9Y1cxoxd6J/AHUme6X6ande1/SHwfeAV4LvAJ6rqELAK+F99z325axtVbwd+Psl/T/JfkvxM1z5XHeNSX79fBJ7qPnSr6NV0zKjXB3PUmGQC+Ajwb97Qf1zW4duBSvJokqeS/Kuu/XRbh3PVN3IZM3KXYRiQe4G76X3r3g3cA3yQ3rftj4C3AecB/zXJ14a1kKfgTOB8ej+FfwZ4MMlPDneRltSs9XVbgCR5B/Dv6G01nq7mWod3AZ+sqiMjskt7seaq70zg57q2HwA7k+wGDg9rQRdprvpGLmOaCP2qOnBsOMnngIe7h78M/HFV/TVwMMmfApP0voFPp8tFvAx8qQvBbyT5G3oXeTrRZS/Gob7/k+Qi4MvArVX1ra7/DL2ajhn1+mDuGq8Efqk7MLgC+Jsk/xfYzXisw5eBx6rqewBJttPbX/4FTq91OFd9I5cxTeze6fYPHvM+4NhR9+8C7+76nEPvW/qbnH6Xi/gKvQNJJHk7vQND36O3zDd2+7kvAdbQOzg2FvUlWQE8Amyqqj891rmqXgFeS3JVd8bHrcBDy73QJ+krzFJjVf18Va2uqtXAvwf+bVV9hjFZh/T+On9tkh/r9nv/E+C503AdfoXZ6xu9jBnmUfBB3IAv0tt/9tf0vn0/BPxHYA+9swS2ARd2fSfonRGxF3gO+Jd907kO+J/0jrD/5rDrmqe+s+htGT0LPAW8u6//b3Y1vEDf2Q/jUB/wr+ntL32673bsrInJrv+3gM/Q/fX5KNxOdh32Pe8u+s5CG4d12PX/p91n8Fngd/vaR3IdnuR7dOQyxsswSFJDmti9I0nqMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQ/4fkkQcB3xhV/gAAAAASUVORK5CYII=\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "df_contracts.startY.hist(bins=10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.026105873821609893"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# proportion of female apprentices\n",
- "1-(df_contracts.a_gender.sum()/df_contracts.shape[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.023723194861701047"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# proportion of female masters\n",
- "1-(df_contracts.m_gender.sum()/df_contracts.shape[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.7310924369747899"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# proportion of female apprentices (a_gender == 0) who have a male master, restricted to startY < 1800\n",
- "df_contracts[(df_contracts.a_gender == 0) & (df_contracts.startY < 1800)].m_gender.sum()\\\n",
- " /df_contracts[(df_contracts.a_gender == 0) & (df_contracts.startY < 1800)].shape[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.9810528582193992"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# proportion of male apprentices (a_gender == 1) who have a male master, restricted to startY < 1800\n",
- "df_contracts[(df_contracts.a_gender == 1) & (df_contracts.startY < 1800)].m_gender.sum()\\\n",
- " /df_contracts[(df_contracts.a_gender == 1) & (df_contracts.startY < 1800)].shape[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Looking at empirical distributions"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAT5ElEQVR4nO3df4xd5Z3f8fenJslGsFnIQkdem9REdVJB2HWXEaHa7WrYdBMD0UKqVYpFA07SdaIFKVFdrcx2JdJESLTdbLZRtqycYEG0KQ4Km2AF0qyXZkQqlQScUMyPUAwxwq5jKyELOySiNfn2j3umc2PGM3Pnju+Yed4v6WrOec45z3nuF/yZM+ece26qCklSG/7ecg9AkjQ6hr4kNcTQl6SGGPqS1BBDX5IacspyD2A+Z555Zq1bt27g7V588UVOPfXUpR/Qq4x16LEOM6xFz0quw549e35YVWfNtuykD/1169bx4IMPDrzd5OQkExMTSz+gVxnr0GMdZliLnpVchyTPHG+Zp3ckqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0JakhJ/0ncpfLum13z7l8/02XjWgkkrR0PNKXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDZk39JPsSHIkySN9bV9M8lD32p/koa59XZKf9i37i75tLkiyN8m+JJ9OkhPyjiRJx7WQ+/RvBT4DfH66oar+xfR0kk8Cz/et/1RVbZiln5uB3we+BdwDbAS+NvCIJUmLNu+RflXdBzw327LuaP29wO1z9ZFkNfCGqrq/qoreL5ArBh6tJGkow34i958Ch6vqyb62c5J8F3gB+OOq+iawBjjQt86Brm1WSbYAWwDGxsaYnJwceGBTU1OL2m7a1vOPzrl8mL5Hadg6rBTWYYa16Gm1DsOG/iZ+/ij/EPCmqvpRkguAryQ5b9BOq2o7sB1gfHy8FvPlxcN+6fHm+R7DcNXi+x6llfzlz4OwDjOsRU+rdVh06Cc5BfjnwAXTbVX1EvBSN70nyVPAW4CDwNq+zdd2bZKkERrmls1/Bnyvqv7/aZskZyVZ1U2/GVgPPF1Vh4AXklzUXQe4GrhriH1LkhZhIbds3g78D+CtSQ4k+WC36EpeeQH3t4CHu1s4vwR8uKqmLwL/AfA5YB/wFN65I0kjN+/pnaradJz2zbO03QnceZz1HwTeNuD4JElLyE/kSlJDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ1ZyBej70hyJMkjfW0fS3IwyUPd69K+Zdcn2ZfkiSTv6mvf2LXtS7Jt6d+KJGk+CznSvxXYOEv7p6pqQ/e6ByDJucCVwHndNv85yaokq4A/By4BzgU2detKkkbolPlWqKr7kqxbYH+XAzur6iXg+0n2ARd2y/ZV1dMASXZ26z42+JAlSYs1b+jP4bokVwMPAlur6sfAGuD+vnUOdG0Azx7T/vbjdZxkC7AFYGxsjMnJyYEHNzU1tajtpm09/+icy4fpe5SGrcNKYR1mWIueVuuw2NC/GfgEUN3PTwIfWKpBVdV2YDvA+Ph4TUxMDNzH5OQki9lu2uZtd8+5fP9Vi+97lIatw0phHWZYi55W67Co0K+qw9PTST4LfLWbPQic3bfq2q6NOdolSSOyqFs2k6zum30PMH1nzy7gyiSvS3IOsB74NvAAsD7JOUleS+9i767FD1uStBjzHuknuR2YAM5McgC4AZhIsoHe6Z39wIcAqurRJHfQu0B7FLi2ql7u+rkO+DqwCthRVY8u9ZuRJM1tIXfvbJql+ZY51r8RuHGW9nuAewYanSRpSfmJXElqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLU
EENfkhpi6EtSQwx9SWqIoS9JDZn36xI1u3Xb7j7usv03XTbCkUjSws17pJ9kR5IjSR7pa/uPSb6X5OEkX05yete+LslPkzzUvf6ib5sLkuxNsi/Jp5PkhLwjSdJxLeT0zq3AxmPadgNvq6pfBf4XcH3fsqeqakP3+nBf+83A7wPru9exfUqSTrB5Q7+q7gOeO6btr6vqaDd7P7B2rj6SrAbeUFX3V1UBnweuWNSIJUmLthTn9D8AfLFv/pwk3wVeAP64qr4JrAEO9K1zoGubVZItwBaAsbExJicnBx7U1NTUorabtvX8o/OvdBzD7HepDVuHlcI6zLAWPa3WYajQT/JvgaPAF7qmQ8CbqupHSS4AvpLkvEH7rartwHaA8fHxmpiYGHhsk5OTLGa7aZvnuFA7n/1XLX6/S23YOqwU1mGGtehptQ6LDv0km4F3A+/oTtlQVS8BL3XTe5I8BbwFOMjPnwJa27VJkkZoUffpJ9kI/CHwu1X1k772s5Ks6qbfTO+C7dNVdQh4IclF3V07VwN3DT16SdJA5j3ST3I7MAGcmeQAcAO9u3VeB+zu7ry8v7tT57eAjyf5v8DPgA9X1fRF4D+gdyfQ64GvdS9J0gjNG/pVtWmW5luOs+6dwJ3HWfYg8LaBRidJWlI+hkGSGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQ5r9jty5vuNWklYqj/QlqSGGviQ1pNnTOyfSfKeO9t902YhGIkk/zyN9SWqIoS9JDTH0Jakhhr4kNcTQl6SGLCj0k+xIciTJI31tb0yyO8mT3c8zuvYk+XSSfUkeTvLrfdtc063/ZJJrlv7tSJLmstAj/VuBjce0bQPurar1wL3dPMAlwPrutQW4GXq/JIAbgLcDFwI3TP+ikCSNxoJCv6ruA547pvly4LZu+jbgir72z1fP/cDpSVYD7wJ2V9VzVfVjYDev/EUiSTqBhvlw1lhVHeqmfwCMddNrgGf71jvQtR2v/RWSbKH3VwJjY2NMTk4OPLipqak5t9t6/tGB+1wqi3k/izVfHVphHWZYi55W67Akn8itqkpSS9FX1992YDvA+Ph4TUxMDNzH5OQkc223eTkfuLb3xeMuWupP685Xh1ZYhxnWoqfVOgxz987h7rQN3c8jXftB4Oy+9dZ2bcdrlySNyDChvwuYvgPnGuCuvvaru7t4LgKe704DfR14Z5Izugu47+zaJEkjsqDTO0luByaAM5McoHcXzk3AHUk+CDwDvLdb/R7gUmAf8BPg/QBV9VySTwAPdOt9vKqOvTgsSTqBFhT6VbXpOIveMcu6BVx7nH52ADsWPDpJ0pLyE7mS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQxYd+knemuShvtcLST6a5GNJDva1X9q3zfVJ9iV5Ism7luYtSJIWakFfjD6bqnoC2ACQZBVwEPgy8H7gU1X1J/3rJzkXuBI4D/gV4G+SvKWqXl7sGCRJg1mq0zvvAJ6qqmfmWOdyYGdVvVRV3wf2ARcu0f4lSQuwVKF/JXB73/x1SR5OsiPJGV3bGuDZvnUOdG2SpBFJVQ3XQfJa4H8D51XV4SRjwA+BAj4BrK6qDyT5DHB/Vf1lt90twNeq6kuz9LkF2AIwNjZ2wc6dOwce19TUFKeddtpxl+89+PzAfY7C+Wt+aUn7m68OrbAOM6xFz0quw8UXX7ynqsZnW7boc/p9LgG+U1WHAaZ/AiT5LPDVbvYgcHbfdmu7tleoqu3AdoDx8fGamJgYeFCTk5PMtd3mbXcP3Oco7L9qYkn7m68OrbAOM6xFT6t1WIrTO5voO7WTZHXfsvcAj3TTu4Ark7wuyTnAeuDbS7B/SdICDXWkn+RU4HeAD/U1/4ckG+id3tk/vayqHk1yB/AY
cBS41jt3JGm0hgr9qnoR+OVj2t43x/o3AjcOs09J0uL5iVxJaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIUOHfpL9SfYmeSjJg13bG5PsTvJk9/OMrj1JPp1kX5KHk/z6sPuXJC3cUh3pX1xVG6pqvJvfBtxbVeuBe7t5gEuA9d1rC3DzEu1fkrQAJ+r0zuXAbd30bcAVfe2fr577gdOTrD5BY5AkHWMpQr+Av06yJ8mWrm2sqg510z8AxrrpNcCzfdse6NokSSNwyhL08ZtVdTDJ3wd2J/le/8KqqiQ1SIfdL48tAGNjY0xOTg48qKmpqTm323r+0YH7HIXFvNe5zFeHVliHGdaip9U6DB36VXWw+3kkyZeBC4HDSVZX1aHu9M2RbvWDwNl9m6/t2o7tczuwHWB8fLwmJiYGHtfk5CRzbbd5290D9zkK+6+aWNL+5qtDK6zDDGvR02odhjq9k+TUJL84PQ28E3gE2AVc0612DXBXN70LuLq7i+ci4Pm+00CSpBNs2CP9MeDLSab7+i9V9V+TPADckeSDwDPAe7v17wEuBfYBPwHeP+T+JUkDGCr0q+pp4Ndmaf8R8I5Z2gu4dph9SpIWz0/kSlJDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWrIUjxaWUto3TxP/9x/02UjGomklcgjfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JasiiQz/J2Um+keSxJI8m+UjX/rEkB5M81L0u7dvm+iT7kjyR5F1L8QYkSQs3zGMYjgJbq+o7SX4R2JNkd7fsU1X1J/0rJzkXuBI4D/gV4G+SvKWqXh5iDJKkASz6SL+qDlXVd7rpvwMeB9bMscnlwM6qeqmqvg/sAy5c7P4lSYNLVQ3fSbIOuA94G/Cvgc3AC8CD9P4a+HGSzwD3V9VfdtvcAnytqr40S39bgC0AY2NjF+zcuXPgMU1NTXHaaacdd/neg88P3OfJ4Pw1vzTn8mPf19jr4fBPF7btSjbf/w8tsRY9K7kOF1988Z6qGp9t2dBP2UxyGnAn8NGqeiHJzcAngOp+fhL4wCB9VtV2YDvA+Ph4TUxMDDyuyclJ5tpu8zxPszxZ7b9qYs7lx76vrecf5ZN7T1nQtivZfP8/tMRa9LRah6Hu3knyGnqB/4Wq+iuAqjpcVS9X1c+AzzJzCucgcHbf5mu7NknSiAxz906AW4DHq+pP+9pX9632HuCRbnoXcGWS1yU5B1gPfHux+5ckDW6Y0zu/AbwP2Jvkoa7tj4BNSTbQO72zH/gQQFU9muQO4DF6d/5c6507kjRaiw79qvrvQGZZdM8c29wI3LjYfUqShuMnciWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDhn4Mg1aOdXM8mmL/TZeNcCSSTpQVHfpzhdir1Up8T5JGx9M7ktQQQ1+SGmLoS1JDVvQ5fY3OfNcavBAsnRw80pekhnikrxVt+i+QrecfnfXb0vwLRK3xSF+SGmLoS1JDDH1Jaojn9KXG7D34/KzXN8BrHC0w9DUSLT7Xp8X3fLKa7b/F9MX91v5bjDz0k2wE/hOwCvhcVd006jG0apjn9iznM39O1s8A+BwkvRqNNPSTrAL+HPgd4ADwQJJdVfXYKMehk8vJGp4n67ikYYz6SP9CYF9VPQ2QZCdwOWDoa9FejX/BnMj9DvOXzzDjmm+/J2vfy2k5TgGmqk5Ix7PuLPk9YGNV/atu/n3A26vqumPW2wJs6WbfCjyxiN2dCfxwiOGuFNahxzrMsBY9K7kO/6CqzpptwUl5IbeqtgPbh+kjyYNVNb5EQ3rVsg491mGGtehptQ6jvk//IHB23/zark2SNAKjDv0HgPVJzknyWuBKYNeIxyBJzRrp
6Z2qOprkOuDr9G7Z3FFVj56g3Q11emgFsQ491mGGtehpsg4jvZArSVpePntHkhpi6EtSQ1Zc6CfZmOSJJPuSbFvu8YxSkh1JjiR5pK/tjUl2J3my+3nGco5xFJKcneQbSR5L8miSj3TtTdUiyS8k+XaS/9nV4d917eck+Vb3b+SL3U0VTUiyKsl3k3y1m2+uFisq9Pse83AJcC6wKcm5yzuqkboV2HhM2zbg3qpaD9zbza90R4GtVXUucBFwbff/QWu1eAn47ar6NWADsDHJRcC/Bz5VVf8Q+DHwweUb4sh9BHi8b765Wqyo0KfvMQ9V9X+A6cc8NKGq7gOeO6b5cuC2bvo24IpRjmk5VNWhqvpON/139P6Rr6GxWlTPVDf7mu5VwG8DX+raV3wdpiVZC1wGfK6bDw3WYqWF/hrg2b75A11by8aq6lA3/QNgbDkHM2pJ1gH/GPgWDdaiO53xEHAE2A08BfxtVR3tVmnp38ifAX8I/Kyb/2UarMVKC33NoXr35zZzj26S04A7gY9W1Qv9y1qpRVW9XFUb6H36/ULgHy3viJZHkncDR6pqz3KPZbmdlM/eGYKPeXilw0lWV9WhJKvpHfGteEleQy/wv1BVf9U1N1kLgKr62yTfAP4JcHqSU7oj3Fb+jfwG8LtJLgV+AXgDve/1aK4WK+1I38c8vNIu4Jpu+hrgrmUcy0h052pvAR6vqj/tW9RULZKcleT0bvr19L7H4nHgG8Dvdaut+DoAVNX1VbW2qtbRy4X/VlVX0WAtVtwncrvf5H/GzGMeblzeEY1OktuBCXqPjD0M3AB8BbgDeBPwDPDeqjr2Yu+KkuQ3gW8Ce5k5f/tH9M7rN1OLJL9K7+LkKnoHeHdU1ceTvJneTQ5vBL4L/Muqemn5RjpaSSaAf1NV726xFisu9CVJx7fSTu9IkuZg6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SG/D/CpIrwm9HJbAAAAABJRU5ErkJggg==\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "df_contracts[df_contracts.annual_salary < 50].annual_salary.hist(bins=40)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAD4CAYAAAAEhuazAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAQoUlEQVR4nO3df4xlZX3H8fenoNWwhoVgJwRohzabNpRtKUyApqaZrSku8AeYGCKhuljN+gckmvIHWxMD9UeyadS2tpZ2LRsxVaekatkALd1snFD+QNm1yPKjlq0uLZPtbuzC6qixWf32j3u2nZ3O751f9z7vVzK55z7nuec+35ydzz3znHPPpqqQJLXlp9Z6AJKk1Wf4S1KDDH9JapDhL0kNMvwlqUFnr/UA5nLBBRfU8PDwaW3f//73Oeecc9ZmQCvIuvrPoNY2qHXB4NY2va4DBw58p6reONdr1nX4Dw8Ps3///tPaxsfHGR0dXZsBrSDr6j+DWtug1gWDW9v0upK8NN9rnPaRpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGretv+EorYXjHI4vqf3jnjSs0EmnteOQvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSg+YN/ySXJPlKkueTPJfkfV37+Un2Jnmxezyva0+STyY5lOSZJFdO2da2rv+LSbatXFmSpLks5Mj/JHBXVV0GXAvckeQyYAewr6o2Afu65wDXA5u6n+3AfdD7sADuAa4BrgbuOfWBIUlaXfOGf1Udqaqvd8vfA14ALgJuAh7ouj0A3Nwt3wR8tnqeBDYmuRB4C7C3qo5X1SvAXmDrchYjSVqYVNXCOyfDwOPA5cC/V9XGrj3AK1W1McnDwM6qeqJbtw+4GxgFXldVH+naPwj8sKo+Nu09ttP7i4GhoaGrxsbGThvD5OQkGzZsWHSh6511rZ6DEycW1X/zRefO2L4ea1sOg1oXDG5t0+vasmXLgaoames1C/4/fJNsAL4IvL+qvtvL+56qqiQL/xSZQ1XtAnYBjIyM1Ojo6Gnrx8fHmd42CKxr9dy+2P/D97bRGdvXY23LYVDrgsGtbSl1LehqnySvoRf8n6uqL3XNR7vpHLrHY137BHDJlJdf3LXN1i5JWmULudonwP3AC1X1iSmr9gCnrtjZBjw0pf2d3VU/1wInquoI8BhwXZLzuhO913VtkqRVtpBpn98A3gEcTPJ01/YBYCfwYJJ3Ay8Bt3TrHgVuAA4BPwDeBVBVx5N8GHiq6/ehqjq+HEVIkhZn3vDvTtxmltVvnqF/AXfMsq3dwO7FDFCStPz8hq8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JatC84Z9kd5JjSZ6d0nZvkokkT3c/N0xZ9/tJDiX5ZpK3TGnf2rUdSrJj+UuRJC3UQo78PwNsnaH9j6rqiu7nUYAklwFvB365e82fJzkryVnAp4DrgcuAW7u+kqQ1cPZ8Harq8STDC9zeTcBYVf0I+HaSQ8DV3bpDVfUtgCRjXd/nFz9kSdKZSlXN36kX/g9X1eXd83uB24HvAvuBu6rqlSR/BjxZVX/d9bsf+PtuM1ur6j1d+zuAa6rqzhneazuwHWBoaOiqsbGx09ZPTk6yYcOGRRe63lnX6jk4cWJR/TdfdO6M7euxtuUwqHXB4NY2va4tW7YcqKqRuV4z75H/LO4DPgxU9/hx4HeXuK3TVNUuYBfA
yMhIjY6OnrZ+fHyc6W2DwLpWz+07HllU/8O3jc7Yvh5rWw6DWhcMbm1LqWtJ4V9VR08tJ/k08HD3dAK4ZErXi7s25miXJK2yJV3qmeTCKU/fCpy6EmgP8PYkP53kUmAT8DXgKWBTkkuTvJbeSeE9Sx+2JOlMzHvkn+QLwChwQZKXgXuA0SRX0Jv2OQy8F6CqnkvyIL0TuSeBO6rqx9127gQeA84CdlfVc8tdjCRpYRZytc+tMzTfP0f/jwIfnaH9UeDRRY1OkrQi/IavJDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNWhJ/4G7tFKGdzyyqP6Hd964QiORBptH/pLUIMNfkhrktI/62mKniST1eOQvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDZo3/JPsTnIsybNT2s5PsjfJi93jeV17knwyyaEkzyS5csprtnX9X0yybWXKkSQtxEKO/D8DbJ3WtgPYV1WbgH3dc4DrgU3dz3bgPuh9WAD3ANcAVwP3nPrAkCStvnnDv6oeB45Pa74JeKBbfgC4eUr7Z6vnSWBjkguBtwB7q+p4Vb0C7OX/f6BIklZJqmr+Tskw8HBVXd49f7WqNnbLAV6pqo1JHgZ2VtUT3bp9wN3AKPC6qvpI1/5B4IdV9bEZ3ms7vb8aGBoaumpsbOy09ZOTk2zYsGFJxa5n1tVzcOLECo5maTZfdO6M7e6z/jOotU2va8uWLQeqamSu15x9pm9aVZVk/k+QhW9vF7ALYGRkpEZHR09bPz4+zvS2QWBdPbfveGTlBrNEh28bnbHdfdZ/BrW2pdS11PA/muTCqjrSTesc69ongEum9Lu4a5ugd/Q/tX18ie8trarhWT6Q7tp8csYPq8M7b1zpIUlnbKmXeu4BTl2xsw14aEr7O7urfq4FTlTVEeAx4Lok53Uneq/r2iRJa2DeI/8kX6B31H5BkpfpXbWzE3gwybuBl4Bbuu6PAjcAh4AfAO8CqKrjST4MPNX1+1BVTT+JLElaJfOGf1XdOsuqN8/Qt4A7ZtnObmD3okYnSVoRZ3zCV9LpZjtHMBvPEWgteHsHSWqQ4S9JDTL8JalBzvlrRR2cOLEuv7gltc4jf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IadPZaD0DS4gzveGRR/Q/vvHGFRqJ+5pG/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ16IzCP8nhJAeTPJ1kf9d2fpK9SV7sHs/r2pPkk0kOJXkmyZXLUYAkafGW48h/S1VdUVUj3fMdwL6q2gTs654DXA9s6n62A/ctw3tLkpZgJaZ9bgIe6JYfAG6e0v7Z6nkS2JjkwhV4f0nSPFJVS39x8m3gFaCAv6yqXUleraqN3foAr1TVxiQPAzur6olu3T7g7qraP22b2+n9ZcDQ0NBVY2Njp73n5OQkGzZsWPKY16tBrevY8RMc/eFaj2JlDL2eZalt80XnLqr/wYkTK7r9Qf23CINb2/S6tmzZcmDKbMyMzvSunm+qqokkPwPsTfIvU1dWVSVZ1KdLVe0CdgGMjIzU6OjoaevHx8eZ3jYIBrWuP/3cQ3z84GDePPauzSeXpbbDt40uqv/ti72r5yK3P6j/FmFwa1tKXWc07VNVE93jMeDLwNXA0VPTOd3jsa77BHDJlJdf3LVJklbZksM/yTlJ3nBqGbgOeBbYA2zrum0DHuqW9wDv7K76uRY4UVVHljxySdKSncnfrEPAl3vT
+pwNfL6q/iHJU8CDSd4NvATc0vV/FLgBOAT8AHjXGby3JOkMLDn8q+pbwK/O0P5fwJtnaC/gjqW+nyRp+fgNX0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktSgwfzevdRHhhd5u4aV3v5dm08yujJD0Trikb8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWrQ2Ws9APWX4R2PLKr/XZtXaCCSzohH/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDfJqn8Yt9uodSYPB8Jd0xhZ7EHF4540rNBItlOEvad3zw2X5rXr4J9kK/AlwFvBXVbVztccgaW5OBw6+VQ3/JGcBnwJ+G3gZeCrJnqp6fjXH0S/8BZS0Ulb7yP9q4FBVfQsgyRhwE2D4Sw1Z6QOb2bZ/1+aT3L5M773YqaX1NnWVqlrRNzjtzZK3AVur6j3d83cA11TVnVP6bAe2d09/EfjmtM1cAHxnFYa72qyr/wxqbYNaFwxubdPr+rmqeuNcL1h3J3yrahewa7b1SfZX1cgqDmlVWFf/GdTaBrUuGNzallLXan/JawK4ZMrzi7s2SdIqWu3wfwrYlOTSJK8F3g7sWeUxSFLzVnXap6pOJrkTeIzepZ67q+q5RW5m1imhPmdd/WdQaxvUumBwa1t0Xat6wleStD54YzdJapDhL0kN6pvwT7I1yTeTHEqyY63Hs5ySHE5yMMnTSfav9XiWKsnuJMeSPDul7fwke5O82D2et5ZjXKpZars3yUS3355OcsNajnEpklyS5CtJnk/yXJL3de19vd/mqKuv91mS1yX5WpJvdHX9Qdd+aZKvdvn4N90FNXNvqx/m/LvbQvwrU24LAdw6KLeFSHIYGKmqvv7ySZLfBCaBz1bV5V3bHwLHq2pn96F9XlXdvZbjXIpZarsXmKyqj63l2M5EkguBC6vq60neABwAbgZup4/32xx13UIf77MkAc6pqskkrwGeAN4H/B7wpaoaS/IXwDeq6r65ttUvR/7/e1uIqvpv4NRtIbSOVNXjwPFpzTcBD3TLD9D7Bew7s9TW96rqSFV9vVv+HvACcBF9vt/mqKuvVc9k9/Q13U8BvwX8bde+oP3VL+F/EfAfU56/zADsyCkK+MckB7rbWwySoao60i3/JzC0loNZAXcmeaabFuqrqZHpkgwDvwZ8lQHab9Pqgj7fZ0nOSvI0cAzYC/wb8GpVney6LCgf+yX8B92bqupK4Hrgjm6KYeBUb45x/c8zLtx9wC8AVwBHgI+v6WjOQJINwBeB91fVd6eu6+f9NkNdfb/PqurHVXUFvTskXA380lK20y/hP9C3haiqie7xGPBlejt0UBzt5l9PzcMeW+PxLJuqOtr9Iv4E+DR9ut+6ueMvAp+rqi91zX2/32aqa1D2GUBVvQp8Bfh1YGOSU1/aXVA+9kv4D+xtIZKc052QIsk5wHXAs3O/qq/sAbZ1y9uAh9ZwLMvqVDh23kof7rfuBOL9wAtV9Ykpq/p6v81WV7/vsyRvTLKxW349vYtgXqD3IfC2rtuC9ldfXO0D0F2S9cf8320hPrq2I1oeSX6e3tE+9G638fl+rS3JF4BRereXPQrcA/wd8CDws8BLwC1V1XcnTmepbZTe9EEBh4H3Tpkn7wtJ3gT8E3AQ+EnX/AF68+N9u9/mqOtW+nifJfkVeid0z6J38P5gVX2oy5Ex4Hzgn4Hfqaofzbmtfgl/SdLy6ZdpH0nSMjL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoP+BweMU0oHLqrWAAAAAElFTkSuQmCC\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "df_contracts[df_contracts.a_age < 30].a_age.hist(bins=25)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Two very important distributions"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Normal\n",
- "\n",
- "Also known as the Gaussian distribution, the normal distribution is bell-shaped, with most of its mass around the mean and density decaying exponentially on both sides. It is fully characterized by its mean (center of mass) and standard deviation (spread).\n",
- "\n",
- "https://en.wikipedia.org/wiki/Normal_distribution"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "s1 = np.random.normal(5, 1, 10000)\n",
- "sns.displot(s1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAD4CAYAAADSIzzWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAANqElEQVR4nO3df2jc933H8dfbkubI8tzOihGZFe8qRNKYGdpGdD86irVEiyUXbxACGyQyaGCIh+xmf4wNBLmA2F9jLOiPQUi3WKxkbGn3zyabKl5gLbTrTlk6Z7YxV1V25LWOKmd1ZSmqZL/3h0433elOkrfv3ftOej7A4O/X37vPW7G/T33veydi7i4AQPXtih4AAHYqAgwAQQgwAAQhwAAQhAADQJDGBzn44Ycf9lQqVaFRAGB7mpiY+LG7Hyje/0ABTqVSymQyyU0FADuAmV0vtZ9bEAAQhAADQBACDABBCDAABCHAABCEAANAEAIMAEEIMAAEIcAAEIQAA0AQAgwAQQgwAAQhwAAQhAADQBACDABBCDAABCHAABCEAANAEAIMAEEe6P8JByRhZGRE2Wy2auvdvHlTknTw4MGqrNfZ2anBwcGqrIX6RoBRddlsVu+9f0X39uyvynoN8z+RJP1osfL/3Bvmb1d8DWwfBBgh7u3Zr4VP91VlrearY5JUlfVW1wK2gnvAABCEAANAEAIMAEEIMAAEIcAAEIQAA0AQAgwAQQgwAAQhwAAQhAADQBACDABBCDAABCHAABCEAANAEAIMAEEIMAAEIcAAEIQAA0AQAgwAQQgwAAQhwAAQhAADQBACDABBCDAABCHAABCEAANAEAIMAEEIMAAEIcAAEIQAA0AQAgwAQQgwAAQhwAAQhAADQBACDABBCDAABCHAABCEAANAEAIMAEEIMAAEIcAAEIQAb8HIyIhGRkaixwB2nO1+7jVGD1APstls9AjAjrTdzz2ugAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCBVCfDs7KzOnDmj2dnZktvl9m302Gw2u27/iy++qNOnTxc8RyaTUXd3t1544QUNDAyot7dXmUxGp0+f1qlTp/T888+ru7tbPT096u7u1okTJ3T06FE999xz+TXm5+eVzWbXzQag8paWlnTmzBllMhn19vbqmWeeUX9/v/r6+pTNZiUVdmFgYEBHjx7VwMBA/hxe24tsNqvjx4/nH7uZcm1KQlUCfO7cOV26dEmjo6Mlt8vt2+ixw8PD6/ZfuXJFly9fLniOdDotd9cHH3ygyclJLSwsKJ1O6/Lly7p27Zqmp6fl7lpaWpK7686dO5KkmZmZ/Bo3btzQ3bt3180GoPJu3bqlS5cuKZ1Oa2FhQYuLi7px44bm5+c1PDwsqbALk5OTkqTJycn8Oby2F8PDw7p7927+sZsp16YkVDzAs7OzunDhgtxdFy5cUDabLdienZ1dd8zaq9pyj52amirYf/78+fya58+f1+zsrDKZjObm5tbNVGpfKatrLC4uSlLBbAAqb2lpSbdv35a7lzxvp6amNDExUdCF4j9f24uxsbH8MVNTU5teBZdrU1IaE322Es6dO6f79+9Lku7du6fh4eGC7dHRUbn7un0vvfTSho9dtbp/eXk5v29paUmjo6O6ePFiol/L4uKiTp06pfb29kSfd6fJZrPa9TOPHqMidn18R9nsT3X27NnoUbaFa9euyX3jfysvv/zyui6Us7S0VLA9PDysN954o+zxxQ1abVNSNr0CNrNTZpYxs8zMzMwDL/D222/n47i8vKypqamC7fHx8XXHjI+Pb/rY
Vav71/4lubvGx8e3fKX7ID766KPEnxNAacXneylzc3NbOq6U4ivmYuXalJRNr4Dd/TVJr0lSV1fXA1+2PP300xobG9Py8rIaGxvV3t6u6enp/HZPT0/+pcHafZs9Nv8F5PZfv349H2EzU09Pjy5evJhohBsbG3X8+PFEvwPuRGfPntXE5K3oMSri/kP71NnRpldffTV6lG3h2Wef3fRl/969e/Xxxx//nyKcSqU2/PPiBq22KSkVvwd88uRJ7dq1skxDQ4OGhoYKtvv7+9cd09/fv+ljV63ub2z83+8lTU1N6u/vVzqdTvRrWTsbgMpra2uTmW14zCuvvLKuC+U0NTUVbA8NDW14fLk2JaXiAW5tbdWxY8dkZjp27Jg6OzsLtltbW9cd09rauuljU6lUwf7e3t78mr29vWptbVVXV5f27t27bqZS+0pZXWP37t2SVDAbgMpramrS/v37ZWYlz9tUKqUnn3yyoAvFf762F319ffljUqmUOjs7N1y/XJuSUpWPoZ08eVJHjhwpuLJdu11u30aPHRoaWrf/iSee0OHDhwueI51Oy8z06KOPqqOjQ83NzUqn0zp8+LAee+wxtbe3y8zU1NQkM9O+ffskSQcOHMivcejQIbW0tHD1CwRoa2vTkSNHlE6n1dzcrN27d+vQoUPas2dP/gp2bRc6OjokSR0dHflzeG0vhoaG1NLSsunV76pybUqCbfYO41pdXV2eyWQSH6LWrb6jzX29ZKzeA174dF9V1mu+OiZJVVmv+eqYnuQecGK2y7lnZhPu3lW8nx9FBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAgjRGD1APOjs7o0cAdqTtfu4R4C0YHByMHgHYkbb7ucctCAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAgjRGD4CdqWH+tpqvjlVprVlJqsp6DfO3JbVVfB1sDwQYVdfZ2VnV9W7eXJYkHTxYjTC2Vf3rQ/0iwKi6wcHB6BGAmsA9YAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCEGAACEKAASAIAQaAIAQYAIIQYAAIQoABIAgBBoAgBBgAghBgAAhCgAEgCAEGgCAEGACCmLtv/WCzGUnXNznsYUk//v8MVSX1MGc9zCjVx5z1MKNUH3My44P7JXc/ULzzgQK8FWaWcfeuRJ+0AuphznqYUaqPOethRqk+5mTG5HALAgCCEGAACFKJAL9WgeeshHqYsx5mlOpjznqYUaqPOZkxIYnfAwYAbA23IAAgCAEGgCCJBdjM/srMPjSz95N6zqSZ2aNm9o6ZXTaz/zSzs9EzlWJmD5nZd83se7k5X4meqRwzazCzfzezf4yepRwzmzKzS2b2nplloucpxcw+aWZvmdlVM7tiZr8WPVMxM3s8999w9dcdM/ty9FzFzOyl3Hnzvpm9aWYPRc9UTmL3gM3si5LmJI26+y8n8qQJM7NHJD3i7u+a2c9LmpD0O+5+OXi0AmZmklrcfc7MmiR9S9JZd/9O8GjrmNkfSuqStM/dvxQ9TylmNiWpy91r6YP5BczsnKRvuvvrZvZzkva4+38Hj1WWmTVIuinpV9x9sx/OqhozO6iV8+Wwuy+Y2d9JGnP3N2InKy2x
K2B3/xdJt5N6vkpw9x+6+7u53/9U0hVJB2OnWs9XzOU2m3K/au7dUjNrl3Rc0uvRs9QzM/uEpC9K+ookufvPajm+OU9J+n4txXeNRknNZtYoaY+k/wqep6wdew/YzFKSPivpX4NHKSn30v49SR9KGnf3WpzzLyT9kaT7wXNsxiV9w8wmzOxU9DAlfErSjKS/zt3Oed3MWqKH2sTvSnozeohi7n5T0p9JuiHph5J+4u7fiJ2qvB0ZYDPbK+lrkr7s7nei5ynF3e+5+2cktUv6vJnV1G0dM/uSpA/dfSJ6li34DXf/nKReSX+Qu11WSxolfU7SX7r7ZyXdlfTHsSOVl7tFckLS30fPUszMfkHSb2vlm9ovSmoxs+djpypvxwU4d0/1a5K+6u5fj55nM7mXou9IOhY8SrEvSDqRu7/6t5J+08z+Jnak0nJXRXL3DyX9g6TPx060zrSk6TWvct7SSpBrVa+kd939VvQgJTwt6QfuPuPuS5K+LunXg2cqa0cFOPfm1lckXXH3P4+epxwzO2Bmn8z9vllSj6SroUMVcfc/cfd2d09p5eXoP7t7zV1pmFlL7g1X5V7W/5akmvqkjrv/SNIHZvZ4btdTkmrqjeEiv6cavP2Qc0PSr5rZntz5/pRW3uupSUl+DO1NSd+W9LiZTZvZ7yf13An6gqQXtHK1tvpRmr7ooUp4RNI7ZvYfkv5NK/eAa/ZjXjWuTdK3zOx7kr4r6Z/c/ULwTKUMSvpq7u/8M5L+NHac0nLfxHq0cmVZc3KvIt6S9K6kS1ppXM3+WDI/igwAQXbULQgAqCUEGACCEGAACEKAASAIAQaAIAQYAIIQYAAI8j84qcWbzlK/eAAAAABJRU5ErkJggg==\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "# for boxplots see https://en.wikipedia.org/wiki/Interquartile_range (or ask!)\n",
- "sns.boxplot(x=s1)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Heavy-tailed\n",
- "Heavy-tailed distributions have a small but non-negligible number of observations with very high values. Several probability distributions follow this pattern: https://en.wikipedia.org/wiki/Heavy-tailed_distribution#Common_heavy-tailed_distributions.\n",
- "\n",
- "We pick the lognormal here: https://en.wikipedia.org/wiki/Log-normal_distribution"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWCElEQVR4nO3df5Bd5X3f8fd3tVoJ4wQJqmGIxAzymHHDZNqakR1sUk+L0hiTTEQzBJh4LBkkkxLHtU0nCa7/SPtX644bbDwpRkZWUEr9i5BCHBdXBmzq2gjLP+ofYIcNro00YITLLhln791d3W//uM+ur+SVdrXas8/q3vdrZuee85znnP2ePauPzj73nHMjM5EkLb+h2gVI0qAygCWpEgNYkioxgCWpEgNYkioZrl3A6bjyyivzwQcfrF2GJM0n5mo8o8+AX3jhhdolSNKindEBLElnMgNYkioxgCWpEgNYkioxgCWpEgNYkioxgCWpEgNYkioxgCWpEgNYkioxgCWpEgNYkioxgCWpEgNYkioZyAButVq0Wq3aZUgacAMZwJK0EhjAklSJASxJlRjAklSJASxJlQxkAGcmrVaLzKxdiqQBNpAB3G632X7no7Tb7dqlSBpgAxnAAEOrR2qXIGnADWwAS1JtBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVdJoAEfEuyPiOxHx7Yj4WESsjYjNEXEgIkYj4hMRMVL6rinzo2X5RU3WJkm1NRbAEbER+NfAlsz8JWAVcD3wPuC2zHwl8CKws6yyE3ixtN9W+klS32p6CGIYOCsihoGXAc8CVwD3luV3A1eX6W1lnrJ8a0REw/VJUjWNBXBmHgbeD/yQbvCOA18FxjJzunQ7BGws0xuBZ8q606X/ecdvNyJuioiDEXHwyJEjTZUvSY1rcghiPd2z2s3ALwBnA1ee7nYzc3dmbsnMLRs2bDjdzUlSNU0OQfwq8P3MPJKZU8B9wOXAujIkAbAJOFymDwMXApTl5wA/bqo4PxlZUm1NBvAPgcsi4mVlLHcr8ATwCHBN6bMDuL9MP1DmKcsfzgbTsTM95ScjS6qqyTHgA3TfTPsa8K3yvXYDfwTcEhGjdMd495RV9gDnlfZbgFubqm2Gn4wsqabh+bssXmb+MfDHxzU/Dbx2jr4t4LebrEeSVhLvhJOkSgxgSarEAJakSgxgSapk4AJ45vpfSapt4AK43W5zw52fp9Pp1C5F0oAbuAAGWOX1v5JWgIEMYElaCQxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgY6gI9OTdJqtWqXIWlADXQAS1JNBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBrAkVdJoAEfEuoi4NyK+GxFPRsTrIuLciNgfEU+V1/Wlb0TE7RExGhHfjIhLm6xNkmpr+gz4g8CDmfkPgX8MPAncCjyUmRcDD5V5gDcBF5evm4A7Gq5NkqpqLIAj4hzgDcAegMyczMwxYBtwd+l2N3B1md4G7Muux4B1EXFBU/VJUm1NngFvBo4AeyPi6xFxV0ScDZyfmc+WPs8B55fpjcAzPesfKm3HiIibIuJgRBw8cuRIg+VLUrOaDOBh4FLgjsx8NfATfjrcAEBmJpCnstHM3J2ZWzJzy4YNG5asWElabk0G8CHgUGYeKPP30g3kH80MLZTX58vyw8CFPetvKm2NyUxarRbd/wckaXk1
FsCZ+RzwTES8qjRtBZ4AHgB2lLYdwP1l+gFge7ka4jJgvGeoohGd6Sl27X2Mdrvd5LeRpDkNN7z9dwD3RMQI8DRwA93Q/2RE7AR+AFxb+n4GuAoYBf6+9G3c0OqR5fg2kvQzGg3gzPwGsGWORVvn6JvA25usR5JWEu+Ek6RKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKDGBJqsQAlqRKBj6AM5NWq0X3Q5klafkMfAB3pqfYfuejtNvt2qVIGjADH8AAQ6tHapcgaQAZwJJUiQEsSZUYwJJUiQEsSZUYwJJUiQEsSZUYwJJUyYICOCIuX0ibJGnhFnoG/KEFtkmSFmj4ZAsj4nXA64ENEXFLz6KfB1Y1WZgk9buTBjAwAry89Pu5nvaXgGuaKkqSBsFJAzgzvwB8ISL+LDN/sEw1SdJAmO8MeMaaiNgNXNS7TmZe0URRkjQIFhrAnwI+DNwFHG2uHEkaHAsN4OnMvKPRSiRpwCz0MrS/iojfi4gLIuLcma9GK5OkPrfQM+Ad5fUPetoSeMXSliNJg2NBAZyZm5suRJIGzYICOCK2z9WemfuWthxJGhwLHYJ4Tc/0WmAr8DXAAJakRVroEMQ7eucjYh3w8SYKkqRBsdjHUf4EcFxYkk7DQseA/4ruVQ/QfQjPLwKfbKooSRoECx0Dfn/P9DTwg8w81EA9kjQwFjQEUR7K8126T0RbD0w2WZQkDYKFfiLGtcDjwG8D1wIHIsLHUUrSaVjoEMR7gddk5vMAEbEB+Bxwb1OFSVK/W+hVEEMz4Vv8+BTWlSTNYaFnwA9GxGeBj5X564DPNFOSJA2G+T4T7pXA+Zn5BxHxW8CvlEVfBu5pujhJ6mfzDSN8gO7nv5GZ92XmLZl5C/CXZVlfODo1SavVql2GpAEzXwCfn5nfOr6xtF3USEWSNCDmC+B1J1l21hLWIUkDZ74APhgRbzu+MSJ2AV9tpiRJGgzzXQXxLuAvI+LN/DRwtwAjwL9ssC5J6nsnDeDM/BHw+oj458Avlea/zsyHG69MkvrcQp8H/AjwSMO1SNJAafxutohYFRFfj4hPl/nNEXEgIkYj4hMRMVLa15T50bL8oqZrk6SaluN24ncCT/bMvw+4LTNfCbwI7CztO4EXS/ttpZ8k9a1GAzgiNgG/DtxV5gO4gp8+xOdu4Ooyva3MU5ZvLf0lqS81fQb8AeAPgU6ZPw8Yy8zpMn8I2FimNwLPAJTl46X/MSLipog4GBEHjxw50mDpktSsxgI4In4DeD4zl/R64czcnZlbMnPLhg0blnLTkrSsFvo0tMW4HPjNiLiK7kfZ/zzwQWBdRAyXs9xNwOHS/zBwIXAoIoaBc+g+9lKS+lJjZ8CZ+Z7M3JSZFwHXAw9n5pvpXs4282kaO4D7y/QDZZ6y/OHMTCSpT9V4qPofAbdExCjdMd49pX0PcF5pvwW4tUJtkrRsmhyCmJWZnwc+X6afBl47R58W3c+cW3aZSavVIjPxwgtJy8WPFQI601Ps2vsY7Xa7dimSBogBXAytHqldgqQBYwBLUiUGsCRVYgBLUiUGsCRVYgBLUiUGsCRVYgBLUiUGsCRVYgBLUiUDF8CtVotOpzN/R0lq2MAFsCStFAawJFViAEtSJQZwcXRqklarVbsMSQPEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgxgSarEAJakSgzgIjNptVpkZu1SJA0IA7joTE+xa+9jtNvt2qVIGhAGcI+h1SO1S5A0QAxgSarEAO7hOLCk5WQA9+hMT7H9zkcdB5a0LAzg4zgOLGm5GMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkB
LEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBLEmVGMCSVIkBfJyjU5O0Wq3aZUgaAAbwHFqtliEsqXGNBXBEXBgRj0TEExHxnYh4Z2k/NyL2R8RT5XV9aY+IuD0iRiPimxFxaVO1SdJK0OQZ8DTwbzLzEuAy4O0RcQlwK/BQZl4MPFTmAd4EXFy+bgLuaLA2SaqusQDOzGcz82tl+u+AJ4GNwDbg7tLtbuDqMr0N2JddjwHrIuKCpuqTpNqWZQw4Ii4CXg0cAM7PzGfLoueA88v0RuCZntUOlbbjt3VTRByMiINHjhxprmhJaljjARwRLwf+AnhXZr7UuywzE8hT2V5m7s7MLZm5ZcOGDUtY6ez2abVadEuTpOY0GsARsZpu+N6TmfeV5h/NDC2U1+dL+2Hgwp7VN5W2ZdWZnuLmfY/TbreX+1tLGjBNXgURwB7gycz8k55FDwA7yvQO4P6e9u3laojLgPGeoYplNbR6pMa3lTRghhvc9uXAW4BvRcQ3Stu/Bf4j8MmI2An8ALi2LPsMcBUwCvw9cEODtUlSdY0FcGZ+EYgTLN46R/8E3t5UPZK00ngnnCRVYgBLUiUGsCRVYgBLUiUGsCRVYgDPwbvhJC0HA3gOnekpdu19zLvhJDXKAD4B74aT1DQD+AQchpDUNAP4BDrTU2y/81GHISQ1xgA+CYchJDXJAJakSgzgk/Aj6iU1yQCWpEoMYEmqxACWpEoMYEmqxACWpEoMYEmqxACeR6vV8lI0SY0wgCWpEgNYkioxgCWpEgNYkioxgE/CZwJLapIBfBKd6Slu3ve4zwSW1AgDeB4+E1hSUwxgSarEAJ5HZjIxMcHExIRjwZKWlAE8j870FDd+5H9x3e37HQuWtKQM4AUYWj3iWLCkJWcAS1IlBrAkVWIAS1IlBrAkVWIAS1IlBvACHZ2a9MHskpaUAbxAPphH0lIzgBeoMz3Frr2PeTOGpCVjAJ+CBIchJC0ZA1iSKjGAF8kxYUmnywBepHa7zXUf+pxjwpIWzQA+Dat8QI+k0zBcu4AzycyzgdesWVO7FEl9wAA+BTPPBo4I/stbXjM7DrxmzRoionZ5ks4wDkGcoqHVIxDBzfseZ3qyzfY7H3UcWNKiGMCLNPOAdh/ULmmxDGBJqsQAlqRKDODTND3ZZmxsjImJCW9TlnRKDODT1JmeYudHv8z4+Lh3xUk6JQbwIh2dmqTT6XRnIth51xdnr4bwNmVJC2EAL5EYXj07FOFtypIWwgBeIp3pKW7e9/hs6HqbsqT5GMBLyGuCJZ0KA3gJOfYr6VQYwEuo94qIXjMP8ZmYmKDT6RjSkgADeOmVKyImWxO8+OKLTExMMDY2xjX/+a+57vb9vPTSS75BJwnwaWiNGFo9wtGpydknp9127T9iaPUIMbyaVqt1zBt0mUm73faJatIA8gy4ATPXCM88Oe0d//UrdDodOtNTvPWOh5menqbVajE2Nsb4+PicZ8THjyc7viz1nxUVwBFxZUR8LyJGI+LW2vUsld6rI2ame8eFM5OxsbFjwnZ8fJxrb99/zG3O133oc7RarZ8J4plw7nQ6x2xT0sq2YoYgImIV8KfAvwAOAV+JiAcy84m6lS2dmTPjmYD9V3u/RKwaJoaG2PnRL3PXDZeRmUQEN+7+AhlD/M6H9jM8soaPvPWXGRpezfj4ODv3/G/uuvH1s5/MERHs+MgX2b39Nez66JeIoVXs+903zA5rrFmzhna7fcqhPLPu5OQkIyMjs9uICNauXUtEzIY/cExb77DKXH1O5mTDMr3LgAUN38y3zpk8DHQm134maernHCvlTCkiXgf8u8x8Y5l/D0Bm/ocTrbNly5Y8ePDgKX2fsbExrr/tM8SqYfLo9Oxrp9Nh1eqRRU0vZhvTkxNErDpmG9OT3askVg2PMBRxzHpTU21G1p49ux4RZGf62P6Zs+vNLI+hYXbfeDm/u+dRMlYxNDTE0elJYmj4pNPQvZlkz87LedveL/GR
G17Pjj/9n0wfPcrIWS9n79v+6ezP9MbdnydWreaet28FoNVq8ba9X+LPb76CtWvX0mq1+J3bH5zts3bt2pMeo1arxVvueHh2/d723m0Dc/Y72fbmWudE328l633wU+3ae/9z7VczvyOfevdVi93POVN7JQXwNcCVmbmrzL8F+OXM/P3j+t0E3FRmXwV8bxHf7h8AL5xGuWcS97V/DdL+nun7+kJmXnl844oZgliozNwN7D6dbUTEwczcskQlrWjua/8apP3t131dSW/CHQYu7JnfVNokqS+tpAD+CnBxRGyOiBHgeuCByjVJUmNWzBBEZk5HxO8DnwVWAR/NzO809O1OawjjDOO+9q9B2t++3NcV8yacJA2alTQEIUkDxQCWpEoGKoD74VbniLgwIh6JiCci4jsR8c7Sfm5E7I+Ip8rr+tIeEXF72edvRsSlPdvaUfo/FRE7au3TfCJiVUR8PSI+XeY3R8SBsk+fKG/aEhFryvxoWX5RzzbeU9q/FxFvrLQr84qIdRFxb0R8NyKejIjX9euxjYh3l9/hb0fExyJibT8f2zll5kB80X1j72+BVwAjwP8BLqld1yL24wLg0jL9c8DfAJcA/wm4tbTfCryvTF8F/A+6d+JcBhwo7ecCT5fX9WV6fe39O8E+3wL8N+DTZf6TwPVl+sPAzWX694APl+nrgU+U6UvK8V4DbC6/B6tq79cJ9vVuYFeZHgHW9eOxBTYC3wfO6jmmb+3nYzvX1yCdAb8WGM3MpzNzEvg4sK1yTacsM5/NzK+V6b8DnqT7y7yN7j9eyuvVZXobsC+7HgPWRcQFwBuB/Zn5/zLzRWA/8DN36tQWEZuAXwfuKvMBXAHcW7ocv68zP4N7ga2l/zbg45nZzszvA6N0fx9WlIg4B3gDsAcgMyczc4w+PbZ0r8I6KyKGgZcBz9Knx/ZEBimANwLP9MwfKm1nrPJn2KuBA8D5mflsWfQccH6ZPtF+nyk/jw8Afwh0yvx5wFhmTpf53rpn96ksHy/9z5R93QwcAfaWIZe7IuJs+vDYZuZh4P3AD+kG7zjwVfr32M5pkAK4r0TEy4G/AN6VmS/1Lsvu32Zn/PWFEfEbwPOZ+dXatSyTYeBS4I7MfDXwE7pDDrP66Niup3v2uhn4BeBsVuZZeqMGKYD75lbniFhNN3zvycz7SvOPyp+flNfnS/uJ9vtM+HlcDvxmRPxfukNGVwAfpPun9sxNRL11z+5TWX4O8GPOjH2F7tnbocw8UObvpRvI/XhsfxX4fmYeycwp4D66x7tfj+2cBimA++JW5zLutQd4MjP/pGfRA8DMu907gPt72reXd8wvA8bLn7OfBX4tItaXs5FfK20rRma+JzM3ZeZFdI/Xw5n5ZuAR4JrS7fh9nfkZXFP6Z2m/vryTvhm4GHh8mXZjwTLzOeCZiHhVadoKPEEfHlu6Qw+XRcTLyu/0zL725bE9odrvAi7nF913jf+G7jul761dzyL34Vfo/gn6TeAb5esquuNhDwFPAZ8Dzi39g+6D7v8W+BawpWdbN9J902IUuKH2vs2z3/+Mn14F8Qq6/8hGgU8Ba0r72jI/Wpa/omf995afwfeAN9Xen5Ps5z8BDpbj+9/pXsXQl8cW+PfAd4FvA39O90qGvj22c315K7IkVTJIQxCStKIYwJJUiQEsSZUYwJJUiQEsSZUYwJJUiQEsSZX8f+QOJFZPad5aAAAAAElFTkSuQmCC\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "s2 = np.random.lognormal(5, 1, 10000)\n",
- "sns.displot(s2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAD4CAYAAADSIzzWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAN4UlEQVR4nO3df2xVZx3H8c+X3rLBphuUpZlloZAukv3lZmNmNIZsbKPViH/sjyUmVI0x0aQgW2K2jIQQ+o/GMIEYl4XFtEbddC7KTIsB3f4ULTrGpAyu7CLcbB3cuV+2dLQ8/nGf3t57uaWl5d7vvb3vV3LTc57znOc5z9PTD+eec5ZZCEEAgMpb5H0AAFCvCGAAcEIAA4ATAhgAnBDAAOAkcS2VV6xYEVpbW8t0KACwMB05cuRCCOG24vJrCuDW1lYNDg5ev6MCgDpgZmdKlXMLAgCcEMAA4IQABgAnBDAAOCGAAcAJAQwATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJ9f0/4Sbr7179yqZTEqS0um0JKmlpaWgTltbm7q7uyt5WADgoqIBnEwm9errQ5pYulwNI+9Lkt4emzqEhpF3K3k4AOCqogEsSRNLl2t0baeWnOiXJI2u7cxtmywDgHrAPWAAcEIAA4ATAhgAnBDAAOCEAAYAJwQwADghgAHACQEMAE4IYABwQgADgBMCGACcEMAA4IQABgAnBDAAOCGAAcAJAQwATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJwQwADghgAHACQEMAE4IYABwQgADgBMCGACcEMAA4IQABgAnBDAAOCGAAcAJAQwATghgAHCSqEQne/fuLVub3d3d171tAKiEigRwMpmsiTYBoJK4BQEATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJwQwADghgAHACQEMAE4IYABwQgADgBMCGACcEMAA4IQABgAnBDAAOCGAAcAJAQwATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJwnvA5iro0ePSpLWrVtX8b4bGho0MTExYz0zUwih5LaWlhZduHBBZqbm5mYNDw/n6ra0tGhsbEzpdFqPPvqo9u/fr3Q6rebmZi1evDh3DI899pj27NmjzZs3a9euXRoZGdHw8LBWrFihTCajnTt3at++fZqYmFBDQ4N6enokSTt27NDmzZu1Z88ebd++XU1NTcpkMiXLZzK5X3H9TCajbdu2ycy0c+fOgj5m03Zx3VL7Xkt71Sx/HJKqakwLZY7no5xzwBXwHMwmfCVNG76SlE6nNTY2posXL+rMmTO6ePGixsbGNDY2ptOnTyudTkuSnnrqKSWTSY2OjiqVSunkyZM6efKkhoaG1NPTo2PHjqmnp0fHjx9XKpXS6Oiozp49q5GREW3fvl1DQ0O5+n19fert7c3tc+zYMfX19UnStOUzmdyvuH5vb6+GhoZ0/PjxK/qYTdvFdUvtey3tVbP8cVTbmKrteDyUcw5qMoA9rnq9XC3EU6mUQghKpVIlt3/00UcF6/39/Tpw4EBunxCCDhw4oGQyWbI8k8lc9dgymUxuv/z6mUxGAwMDuXoDAwMFfczUdnG7pfadru9akz+OgYEBDQwMVM2YFsocz0e556AityDS6bRGR0clSYs+nj5QFl38QMnkh9qyZUslDqvuXLp0SWZWUDYxMaGenh5dvnz5ivK+vj5t3bp12vZ6e3tz++XX7+3t1fj4eEG/+X3M1HZxu6X2DSHMur1qlj/WS5cu5cqrYUzT/X7rSbnnYMYrYDP7jpkNmtng+fPnr1vH8FF8RT0+Pq5U
KlUQmJPlBw8evGpbhw4dyu2XX//QoUMF/UxeWZeqO5t2S+07Xd+1Jn8cIYTcvFXDmBbKHM9HuedgxivgEMIzkp6RpPb29ukvX6+ipaUlt3zk9PC09S7f+Em1rWnW7t27r9pePd2CuN6KHwwmEgmtXLlS586dKwjhRCKhBx544KptrV+/Xv39/RofHy+ov379er300ku5fsxMq1atyvUxU9vF7eYf3+S+IYSSfdea/LFOfjsJIVTFmKb7/daTcs9BTd4Dxtw0NjaqsbGxoKyhoUHbtm3TokWLrijftGnTVdvr6urK7Zdfv6urS4nE1L/tjY2NBX3M1HZxu6X2na7vWpM/jsbGxty8VcOYFsocz0e556AmA/iVV17xPoSKKb5nm6+1tVVmptbW1pLbb7755oL1zs5ObdiwIbePmWnDhg1qa2srWT7TKzdNTU25/fLrNzU1qaOjI1evo6OjoI+Z2i5ut9S+0/Vda/LH0dHRoY6OjqoZ00KZ4/ko9xzU7HvAnir5HvDWrVvn/B7wjh07Ct4DnvzXO5VK5d73zb9qLVU+k8n9iut3dXXp1KlTMrMr+phN28V1S+17Le1Vs+JxVNOYFsocz0c558Cu9ppTsfb29jA4OHjNneS/1XDk9LBG13ZqyYl+SdLo2s7ctiUn+vXZWdwDzm9zNnUBwJOZHQkhtBeX1+QtCABYCAhgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJwQwADghgAHACQEMAE4IYABwQgADgBMCGACcEMAA4IQABgAnBDAAOCGAAcAJAQwATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJ4lKdNLW1iZJSiaT171NAKhVFQng7u5uSdKWLVuue5sAUKu4BQEATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJwQwADghgAHACQEMAE4IYABwQgADgBMCGACcEMAA4IQABgAnBDAAOCGAAcAJAQwATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJwQwADghgAHASaLSHTaMvKslJ/rVMJKRJC050V+wTWqu9CEBgIuKBnBbW1tuOZ0elyS1tOQHbnNBHQBYyCoawN3d3ZXsDgCqGveAAcAJAQwATghgAHBCAAOAEwIYAJwQwADghAAGACcEMAA4IYABwAkBDABOCGAAcEIAA4ATAhgAnBDAAOCEAAYAJwQwADghgAHACQEMAE4IYABwQgADgBMLIcy+stl5SWfm2NcKSRfmuO9CwjxMYS6mMBdZC3UeVoUQbisuvKYAng8zGwwhtFeksyrGPExhLqYwF1n1Ng/cggAAJwQwADipZAA/U8G+qhnzMIW5mMJcZNXVPFTsHjAAoBC3IADACQEMAE7KHsBmtsHM3jCzpJk9Xu7+PJjZHWb2spkdN7N/mdmWWL7czA6a2an4c1ksNzPbE+fkNTO7J6+trlj/lJl1eY1pPsyswcz+aWZ/jOurzexwHO/zZrY4lt8Q15Nxe2teG0/E8jfM7CGnocyLmd1qZi+Y2QkzGzKzz9fjOWFmW+Pfxetm9mszu7Fez4krhBDK9pHUIOnfktZIWizpqKS7ytmnx0fS7ZLuicufkHRS0l2SfiTp8Vj+uKQfxuVOSQOSTNK9kg7H8uWSTsefy+LyMu/xzWE+HpX0K0l/jOu/kfRIXH5a0nfj8vckPR2XH5H0fFy+K54rN0haHc+hBu9xzWEeeiV9Oy4vlnRrvZ0TklokvSlpSd658I16PSeKP+W+Av6cpGQI4XQI4WNJz0naWOY+Ky6E8FYI4R9x+UNJQ8qeeBuV/SNU/Pm1uLxRUl/I+qukW83sdkkPSToY
Qng3hPBfSQclbajcSObPzFZK+rKkfXHdJN0n6YVYpXgeJufnBUn3x/obJT0XQhgLIbwpKansuVQzzOwWSV+S9KwkhRA+DiG8pzo8JyQlJC0xs4SkpZLeUh2eE6WUO4BbJJ3NWz8Xyxas+JXpbkmHJTWHEN6Km96W1ByXp5uXhTBfP5H0A0mX43qTpPdCCONxPX9MufHG7e/H+gthHlZLOi/p5/F2zD4zu0l1dk6EENKSfizpP8oG7/uSjqg+z4kr8BDuOjKzmyX9TtL3Qwgf5G8L2e9RC/qdPzP7iqR3QghHvI+lCiQk3SPpZyGEuyX9T9lbDjl1ck4sU/bqdbWkT0m6SbV3BV825Q7gtKQ78tZXxrIFx8walQ3fX4YQXozFw/FrpOLPd2L5dPNS6/P1BUlfNbOUsreb7pO0W9mv04lYJ39MufHG7bdIyqj250HKXqGdCyEcjusvKBvI9XZOrJf0ZgjhfAjhkqQXlT1P6vGcuEK5A/jvku6MTzwXK3tTfX+Z+6y4eI/qWUlDIYRdeZv2S5p8at0l6Q955Zvik+97Jb0fv5b+SdKDZrYsXjk8GMtqQgjhiRDCyhBCq7K/67+EEL4u6WVJD8dqxfMwOT8Px/ohlj8Sn4ivlnSnpL9VaBjXRQjhbUlnzezTseh+ScdVZ+eEsrce7jWzpfHvZHIe6u6cKKncT/mUfbp7Utmnlk96P3Us0xi/qOxXydckvRo/ncreu/qzpFOSDklaHuubpJ/GOTkmqT2vrW8p+4AhKemb3mObx5ys09RbEGuU/WNJSvqtpBti+Y1xPRm3r8nb/8k4P29I6vAezxzn4DOSBuN58Xtl32Kou3NC0g5JJyS9LukXyr7JUJfnRPGH/xQZAJzwEA4AnBDAAOCEAAYAJwQwADghgAHACQEMAE4IYABw8n/Y4kuV5/nC1AAAAABJRU5ErkJggg==\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "sns.boxplot(x=s2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAX1ElEQVR4nO3dfaxc9X3n8fc3xjc8xNjEcW3nmsh2YmVLXDUGF5OQjVLYVECzga3SbKJugiK63s06iDSrpKT9o6qUlZLdqrhBiK4F2ZjdBEoJyDSLICwPKZUKjWHcMITscuMF44uxzUMcCGUvxt/9Yw6X8XBtD/Y985uH90u6uuf85sw9X1m+n5n7m99DZCaSpN57S+kCJGlUGcCSVIgBLEmFGMCSVIgBLEmFHFe6gGNx3nnn5e233166DEk6kpipcaDfAT/zzDOlS5CkozbQASxJg8wAlqRCDGBJKsQAlqRCDGBJKsQAlqRCDGBJKsQAlqRCDGBJKsQAlqRCDGBJKsQAlqRCag3giHg8Ih6OiG0RsbVqe3tE3BkRj1XfT6naIyK+GRETEfHjiDi9ztokqbReLEf5m5nZvmzZ5cBdmfn1iLi8Ov9D4HxgVfW1Dri6+i7NqqmpKRqNxkFta9asYWxsrFBFGlUl1gO+EPhIdbwZuJdWAF8IXJetbZrvj4gFEbE0M3cVqFFDrNFocOlVW5g/vhKAfZPbuXIDrFvn6716q+4ATuAHEZHAf83MTcDitlB9GlhcHY8DT7Y9d2fVdlAAR8R6YD3Au971rhpL1zCbP76ShSveV7oMjbi6A/hDmTkZEb8C3BkRP21/MDOzCueuVSG+CWDt2rVv6rmS1E9q/RAuMyer73uAW4Azgd0RsRSg+r6nunwSOLXt6cuqNkkaSrUFcEScFBHzXjsGfgtoArcCF1eXXQxsqY5vBT5bjYY4C9hn/6+kYVZnF8Ri4JaIeO0+383M2yPiR8CNEXEJ8ATwyer624ALgAngJeBzNdamIeYoBw2K2gI4M7cDvz5D+7PAuTO0J7Chrno0OhzloEEx0NvSS4fiKAcNAqciS1IhvgPWyDvw6n6azeZBbfYZqxcMYI28F3bvYOPjL7NkojWs3D5j9YoBLAHzliy3z1g9Zx+wJBXiO2ANvc4+3mazSR5wFrvKM4A19Dr7eCe33ceCVWcUrkoygDUi2vt49z21vXA1UosBLHXB6c2qgwEsdcHpzaqDASx1yenNmm0OQ5OkQgxgSSrEAJakQgxgSSrEAJakQhwFoYHXOUbXqcYaFAawBl7nGF2nGmtQGMAaCu1jdJ1qrEFhAEsdZtohw24N1cEAljp0rp4GdmuoHgawNIPOHTLs1lAdHIYmSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiMPQpKMw02QN94jTm2UAS0ehc7LG8zse4/PnNFm9evX0NQayjsQAlo5S51b3G+94ZDqQ3bRT3TCApVnSOXtOOhI/hJOkQgxgSSrEAJakQgxgSSrEAJakQgxgSSrEAJakQgxgSSrEAJakQpwJp4EyNTVFo9E4qM0dizWoDGANlEajwaVXbWH++MrpNncs1qAygDVw5o+vdMdiDQX7gCWpEANYkgoxgCWpEANYkgoxgCWpEANYkgoxgCWpkNoDOCLmREQjIr5fna+IiAciYiIi/ioixqr2t1bnE9Xjy+uuTZJK6sU74MuAR9vOvwFckZnvAZ4HLqnaLwGer9qvqK6TpKFVawBHxDLgt4FrqvMAzgFuqi7ZDFxUHV9YnVM9fm51vSQNpbrfAW8EvgIcqM4XAj/PzP3V+U5gvDoeB54EqB7fV11/kIhYHxFbI2Lr3r17ayxdkupVWwBHxMeAPZn54Gz+3MzclJlrM3PtokWLZvNHS1JP1bkYz9nAxyPiAuB44GTgL4AFEXFc9S53GTBZXT8JnArsjIjjgPnAszXWJ0lF1fYOODO/mpnLMnM58Cng7sz8PeAe4BPVZRcD
W6rjW6tzqsfvzkwXeZU0tEqMA/5D4EsRMUGrj/faqv1aYGHV/iXg8gK1SVLP9GQ94My8F7i3Ot4OnDnDNS8Dv9uLejQ4OnfAcPcLDRMXZFdf69wBw90vNEwMYPW99h0w3P1Cw8S1ICSpEANYkgoxgCWpEANYkgoxgCWpEANYkgoxgCWpEANYkgoxgCWpEGfCSTU48Op+ms3mQW1r1qxhbGysUEXqRwawVIMXdu9g4+Mvs2SitXDQ8zse4/PnNFm9evX0NQayDGCpJvOWLD9oDYuNdzwyHcj7Jrdz5QZYt25dyRJVmAEs9Uh7IEvgh3CSVIwBLEmFGMCSVIgBLEmFGMCSVIijINRX3IRTo8QAVl9xE06NEgNYfcdNODUq7AOWpEIMYEkqxACWpEIMYEkqxACWpEIMYEkqxACWpEIcBywVMNOWReAuGaPGAJYK6NyyCNwlYxQZwFIh7pAh+4AlqRADWJIKMYAlqRADWJIKMYAlqRADWJIKMYAlqRADWJIKMYAlqRADWJIKMYAlqRADWJIKMYAlqRADWJIKcTlKFTU1NUWj0Zg+bzab5IE8zDOk4WEAq6hGo8GlV21h/vhKACa33ceCVWcUrkrqDQNYxc0fXzm9MPm+p7YXrkbqHfuAJakQA1iSCqktgCPi+Ij4h4j4x4h4JCL+tGpfEREPRMRERPxVRIxV7W+tzieqx5fXVZsk9YM63wH/P+CczPx14P3AeRFxFvAN4IrMfA/wPHBJdf0lwPNV+xXVdZI0tGoL4Gx5sTqdW30lcA5wU9W+GbioOr6wOqd6/NyIiLrqk6TSau0Djog5EbEN2APcCfwM+Hlm7q8u2QmMV8fjwJMA1eP7gIUz/Mz1EbE1Irbu3bu3zvIlqVa1BnBmvpqZ7weWAWcC/2wWfuamzFybmWsXLVp0rD9OkorpySiIzPw5cA/wAWBBRLw2/ngZMFkdTwKnAlSPzwee7UV9klRCnaMgFkXEgur4BOCjwKO0gvgT1WUXA1uq41urc6rH785M56RKGlp1zoRbCmyOiDm0gv7GzPx+RPwEuCEivgY0gGur668F/ntETADPAZ+qsTZJKq62AM7MHwNrZmjfTqs/uLP9ZeB366pH6ncHXt1Ps9k8qG3NmjWMjY0Vqkh1cy0IqU+8sHsHGx9/mSUTrZ63fZPbuXIDrFu3rnBlqosBLPWReUuWTy9MpOHnWhCSVIgBLEmFdBXAEXF2N22SpO51+w74yi7bJEldOuyHcBHxAeCDwKKI+FLbQycDc+osTJKG3ZFGQYwBb6uum9fW/gten80mSToKhw3gzPwh8MOI+HZmPtGjmiRpJHQ7DvitEbEJWN7+nMw8p46iJGkUdBvAfw38JXAN8Gp95UjS6Og2gPdn5tW1ViJJI6bbAP6biPgPwC209noDIDOfq6UqDYWpqSkajcYb2l1gRmrpNoBfW6f3y21tCayc3XI0TBqNBpdetYX546//N3GBGel1XQVwZq6ouxANp/njK11cRjqErgI4Ij47U3tmXje75UjS6Oi2C+I32o6PB84FHgIMYEk6St12QVzafl7t9XZDHQVJ0qg42uUofwnYLyxJx6DbPuC/oTXqAVqL8PwqcGNdRUnSKOi2D/jP2o73A09k5s4a6pGkkdFVF0S1KM9Paa2IdgowVWdRkjQKuu2C+CTwX4B7gQCujIgvZ+ZNNdamIdS59Xqz2SQP5GGeIQ2vbrsg/hj4jczcAxARi4D/BRjAelM6t16f3HYfC1adUbiq/tT5YgVO4x423QbwW14L38qzuKGnjlL71uv7ntpeuJr+1fli5TTu4dNtAN8eEXcA11fn/xq4rZ6SJL2m/cVKw+dIe8K9B1icmV+OiN8BPlQ99PfAd+ouTpKG2ZHeAW8EvgqQmTcDNwNExK9Vj/3LGmuTpKF2pH7cxZn5cGdj1ba8lookaUQcKYAXHOaxE2axDkkaOUcK4K0R8W87GyPi94EH6ylJkkbDkfqAvwjcEhG/x+uB
uxYYA/5VjXVJ0tA7bABn5m7ggxHxm8Dqqvl/ZubdtVcmSUOu2/WA7wHuqbkWSRopzmaTpEIMYEkqxACWpEIMYEkqxACWpEIMYEkqxACWpEIMYEkqpNsF2aUjmpqaotFoTJ+735t0eAawZk2j0eDSq7Ywf3wl4H5v0pEYwJpV88dXut+b1CX7gCWpEANYkgoxgCWpEANYkgoxgCWpEANYkgoxgCWpkNoCOCJOjYh7IuInEfFIRFxWtb89Iu6MiMeq76dU7RER34yIiYj4cUScXldtktQP6nwHvB/4j5l5GnAWsCEiTgMuB+7KzFXAXdU5wPnAquprPXB1jbVJUnG1BXBm7srMh6rjF4BHgXHgQmBzddlm4KLq+ELgumy5H1gQEUvrqk+SSutJH3BELAfWAA8AizNzV/XQ08Di6ngceLLtaTurts6ftT4itkbE1r1799ZXtCTVrPa1ICLibcD3gC9m5i8iYvqxzMyIeFPLZWXmJmATwNq1a11qSyPjwKv7aTabB7WtWbOGsbGxQhXpWNUawBExl1b4ficzb66ad0fE0szcVXUx7KnaJ4FT256+rGqTBLywewcbH3+ZJROt9x37Jrdz5QZYt25d4cp0tOocBRHAtcCjmfnnbQ/dClxcHV8MbGlr/2w1GuIsYF9bV4UkYN6S5Sxc8T4Wrnjf9LKfGlx1vgM+G/gM8HBEbKva/gj4OnBjRFwCPAF8snrsNuACYAJ4CfhcjbVJUnG1BXBm/h0Qh3j43BmuT2BDXfVIUr9xJpwkFWIAS1IhBrAkFWIAS1IhBrAkFeKuyNKAcmbc4DOApQHlzLjBZwBLA+y1mXEaTPYBS1IhBrAkFWIAS1Ih9gFLQ2KmURHgyIh+ZgBLQ6JzVAQ4MqLfGcDSEHFUxGCxD1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCnE9YB21qakpGo3G9Hmz2SQP5GGeIamdAayj1mg0uPSqLcwfXwnA5Lb7WLDqjMJVSYPDANYxmT++cnoHhn1PbS9cjTRY7AOWpEIMYEkqxACWpEIMYEkqxACWpEIcBaFD6hzn+8orrwAwd+5cwHG/0rEygHVIM43znTNvIUvefdr0ueN++9uBV/fTbDYPaluzZg1jY2OFKlI7A1iH1TnO97j5Sxz3O0Be2L2DjY+/zJKJ1l8q+ya3c+UGWLduXeHKBAawNPTmLVk+/aKp/uKHcJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUSG0BHBHfiog9EdFsa3t7RNwZEY9V30+p2iMivhkRExHx44g4va66JKlf1PkO+NvAeR1tlwN3ZeYq4K7qHOB8YFX1tR64usa6JKkv1BbAmfm3wHMdzRcCm6vjzcBFbe3XZcv9wIKIWFpXbZLUD3rdB7w4M3dVx08Di6vjceDJtut2Vm1vEBHrI2JrRGzdu3dvfZVKUs2KrQecmRkRb3o/m8zcBGwCWLt2rfvhzKLOLYjcckiqV68DeHdELM3MXVUXw56qfRI4te26ZVWbemimLYjcckiqT68D+FbgYuDr1fctbe1fiIgbgHXAvrauCvVQ5xZEGi7uEddfagvgiLge+AjwjojYCfwJreC9MSIuAZ4APlldfhtwATABvAR8rq66pFHmHnH9pbYAzsxPH+Khc2e4NoENddUi6XXuEdc/nAknSYUYwJJUiAEsSYUUGwcsqTxHRZRlAI+ozkkX4MSLUeSoiLIM4BHVOekCnHgxqhwVUY4BPMLaJ12AEy+kXvNDOEkqxACWpEIMYEkqxACWpEL8EG5EuNav1H8M4BHhWr9S/zGAR4hr/Ur9xT5gSSrEAJakQgxgSSrEAJakQvwQTtI0l6fsLQNY0jSXp+wtA1jSQVyesncM4CHlzDep/xnAQ8qZb1L/M4CHmDPfpP7mMDRJKsQAlqRC
DGBJKsQAlqRC/BBO0iE5M65eBvCQcNyv6tA5M+75HY/x+XOarF69+qDrDOWjYwAPCcf9qi7tM+P2PbWdjXc8Mh3I4HTlY2EADxHH/aoXOqcq201x9AxgScfEBXyOngEs6Zi5gM/RcRiaJBViAEtSIQawJBViH7CkWeWoiO4ZwJJmlaMiumcADyhnvqmfOSqiOwbwgJgpcP/y3gkWLHs34Mw3aRAZwAPiUFONnfkmDS4DeIA41VgaLgZwH+jsXgA/NZZGgQHcBzq7F/zUWBoNBnCfaO9ekIaJ44IPzQDuQzP9h3WYmQZVN4u6j2ogG8B9qPM/LDjMTIPtcIu6j3KXmwHcpzoHsjvqQcOk/f/3KHdRGMCSihrlqcsGcA84zEw6vFGdumwA94DDzKTuzdQlAcP5pqWvAjgizgP+ApgDXJOZXy9cUlc63+G+8sorAMydOxdojWA4eemKkXyFl96smT6EHtaRE30TwBExB7gK+CiwE/hRRNyamT+Zzfsca3fATM+faWGcOfMWsuTdp02ft49g6HyFd4iZdLCZPoRuHznRGcidb3rg4N/rmX5vj3TNkX7mbOibAAbOBCYycztARNwAXAjMagA3Gg0++8dXcNI7lgLwy2d28ZVPf/SgV9bDaTab/Ofr75x+PsAzP2syf8WvHfZ5Lzz9OM+edCIAux7+e752/4uc8s7mQc+PtwQAL+6ZZM4/vTx9/Uxts30+qPcY1Lr9tzmKe8xbOH3+0nNP87XNEwf9Ds054WROeee7gDf+Xs/0e3uka2b6mdf9pz+Y1a7DyOyPd14R8QngvMz8/er8M8C6zPxCx3XrgfXV6XuB/30Ut3sH8MwxlDsb+qEG6I86+qEG6I86rOF1/VDHbNXwTGae19nYT++Au5KZm4BNx/IzImJrZq6dpZIGtoZ+qaMfauiXOqyhv+qou4Z+2pRzEji17XxZ1SZJQ6mfAvhHwKqIWBERY8CngFsL1yRJtembLojM3B8RXwDuoDUM7VuZ+UhNtzumLoxZ0g81QH/U0Q81QH/UYQ2v64c6aq2hbz6Ek6RR009dEJI0UgxgSSpkpAI4Ir4VEXsi4o0TzXtXw6kRcU9E/CQiHomIywrUcHxE/ENE/GNVw5/2uoa2WuZERCMivl+whscj4uGI2BYRWwvVsCAiboqIn0bEoxHxgQI1vLf6N3jt6xcR8cUCdfxB9f+yGRHXR8TxBWq4rLr/I3X+G4xUH3BEfBh4EbguM7ub+jb7NSwFlmbmQxExD3gQuGi2p1wfoYYATsrMFyNiLvB3wGWZeX+vamir5UvAWuDkzPxYr+9f1fA4sDYziw36j4jNwH2ZeU01CujEzPx5wXrm0BoGui4zn+jhfcdp/X88LTP/KSJuBG7LzG/3sIbVwA20ZudOAbcD/z4zJ2b7XiP1Djgz/xZ4rnANuzLzoer4BeBRYLzHNWRmvlidzq2+ev5KHBHLgN8Grun1vftJRMwHPgxcC5CZUyXDt3Iu8LNehm+b44ATIuI44ETgqR7f/1eBBzLzpczcD/wQ+J06bjRSAdxvImI5sAZ4oMC950TENmAPcGdm9rwGYCPwFeBAgXu3S+AHEfFgNdW911YAe4H/VnXHXBMRJxWoo92ngOt7fdPMnAT+DNgB7AL2ZeYPelxGE/jnEbEwIk4ELuDgSWKzxgAuJCLeBnwP+GJm/qLX98/MVzPz/bRmHJ5Z/dnVMxHxMWBPZj7Yy/sewocy83TgfGBD1VXVS8cBpwNXZ+Ya4JfA5T2uYVrVBfJx4K8L3PsUWotwrQDeCZwUEf+mlzVk5qPAN4Af0Op+2Aa8Wse9DOACqn7X7wHfycybS9ZS/al7D/CGhUJqdjbw8ar/9QbgnIj4Hz2uAZh+10Vm7gFuodX310s7gZ1tf4XcRCuQSzkfeCgzdxe4978A/m9m7s3MV4CbgQ/2uojMvDYzz8jMDwPPA/+njvsYwD1WfQB2LfBoZv55oRoWRcSC6vgEWmsw/7SXNWTm
VzNzWWYup/Xn7t2Z2dN3OgARcVL1YSjVn/2/RetP0J7JzKeBJyPivVXTuczyMqxv0qcp0P1Q2QGcFREnVr8r59L6nKSnIuJXqu/votX/+9067tM3U5F7ISKuBz4CvCMidgJ/kpnX9riMs4HPAA9XfbAAf5SZt/WwhqXA5uqT7rcAN2ZmsWFghS0Gbmn9rnMc8N3MvL1AHZcC36n+/N8OfK5ADa+9CH0U+Hcl7p+ZD0TETcBDwH6gQZkpyd+LiIXAK8CGuj4UHalhaJLUT+yCkKRCDGBJKsQAlqRCDGBJKsQAlqRCDGBJKsQAlqRC/j+pJYsbFLnGdwAAAABJRU5ErkJggg==\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Why \"lognormal\"?\n",
- "\n",
- "sns.displot(np.log(s2))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Box plots\n",
- "\n",
- " "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Outliers, missing values\n",
- "\n",
-    "An *outlier* is an observation far from the center of mass of the distribution. It might be an error or a genuine observation: this distinction requires domain knowledge. Outliers influence the outcomes of several statistics and machine learning methods: it is important to decide how to deal with them.\n",
- "\n",
-    "A *missing value* is an observation without a value. There can be many reasons for a missing value: the value might not exist (hence its absence is informative and it should be left empty) or might not be known (hence the value exists but is missing from the dataset and it should be marked as NA).\n",
- "\n",
- "*One way to think about the difference is with this Zen-like koan: An explicit missing value is the presence of an absence; an implicit missing value is the absence of a presence.*"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Summary statistics\n",
- "A statistic is a function of a collection of observations, or otherwise stated a measure over a distribution. \n",
- "\n",
- "A statistic is said to be *robust* if not sensitive to outliers.\n",
- "\n",
- "* Not robust: min, max, mean, standard deviation.\n",
- "* Robust: mode, median, other quartiles.\n",
- "\n",
- "A closer look at the mean:\n",
- "\n",
- "$\\bar{x} = \\frac{1}{n} \\sum_{i}x_i$\n",
- "\n",
- "And variance (the standard deviation is the square root of the variance):\n",
- "\n",
- "$Var(x) = \\frac{1}{n} \\sum_{i}(x_i - \\bar{x})^2$\n",
- "\n",
- "The mean, the median, etc. are measures of location (e.g., the typical value); the variance is a measure of dispersion."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "4.9780085768562925\n",
- "251.20393308182292\n"
- ]
- }
- ],
- "source": [
-    "# Not robust: min, max, mean, standard deviation\n",
- "\n",
- "print(np.mean(s1)) # should be 5\n",
- "print(np.mean(s2))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "4.990913153725976\n",
- "148.01429155699833\n"
- ]
- }
- ],
- "source": [
- "# Robust: median, other quartiles\n",
- "\n",
- "print(np.quantile(s1, 0.5)) # should coincide with mean and mode\n",
- "print(np.quantile(s2, 0.5))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Questions\n",
- "\n",
- "* Calculate the min, max, mode and sd. *hint: explore the numpy documentation!*\n",
- "* Calculate the 90% quantile values.\n",
- "* Consider our normally distributed data in s1. Add an outlier (e.g., value 100). What happens to the mean and mode? Write down your answer and then check."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " annual_salary \n",
- " a_age \n",
- " length \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " count \n",
- " 7870.000000 \n",
- " 9303.000000 \n",
- " 9645.000000 \n",
- " \n",
- " \n",
- " mean \n",
- " 5.916921 \n",
- " 14.266688 \n",
- " 5.005694 \n",
- " \n",
- " \n",
- " std \n",
- " 6.985214 \n",
- " 2.902770 \n",
- " 1.462343 \n",
- " \n",
- " \n",
- " min \n",
- " 0.166667 \n",
- " 1.000000 \n",
- " 0.083333 \n",
- " \n",
- " \n",
- " 25% \n",
- " 3.000000 \n",
- " 12.000000 \n",
- " 4.000000 \n",
- " \n",
- " \n",
- " 50% \n",
- " 4.000000 \n",
- " 14.000000 \n",
- " 5.000000 \n",
- " \n",
- " \n",
- " 75% \n",
- " 6.000000 \n",
- " 16.000000 \n",
- " 6.000000 \n",
- " \n",
- " \n",
- " max \n",
- " 180.000000 \n",
- " 50.000000 \n",
- " 15.000000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " annual_salary a_age length\n",
- "count 7870.000000 9303.000000 9645.000000\n",
- "mean 5.916921 14.266688 5.005694\n",
- "std 6.985214 2.902770 1.462343\n",
- "min 0.166667 1.000000 0.083333\n",
- "25% 3.000000 12.000000 4.000000\n",
- "50% 4.000000 14.000000 5.000000\n",
- "75% 6.000000 16.000000 6.000000\n",
- "max 180.000000 50.000000 15.000000"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Let's explore our dataset\n",
- "df_contracts[[\"annual_salary\",\"a_age\",\"length\"]].describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Relating two variables\n",
- "\n",
- "### Covariance\n",
- "\n",
- "Measure of association, specifically of the joint linear variability of two variables:\n",
- "\n",
- " \n",
- "\n",
- "Its normalized version is called the (Pearson's) correlation coefficient:\n",
- "\n",
- " \n",
- "\n",
-    "Correlation is helpful to spot possible relations, but it is tricky to interpret and is not exhaustive:\n",
- "\n",
- " \n",
- "\n",
- "See: https://en.wikipedia.org/wiki/Covariance and https://en.wikipedia.org/wiki/Pearson_correlation_coefficient.\n",
- "\n",
- "*Note: correlation is not causation!*"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " annual_salary \n",
- " a_age \n",
- " length \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " annual_salary \n",
- " 1.000000 \n",
- " 0.205404 \n",
- " -0.361611 \n",
- " \n",
- " \n",
- " a_age \n",
- " 0.205404 \n",
- " 1.000000 \n",
- " -0.430062 \n",
- " \n",
- " \n",
- " length \n",
- " -0.361611 \n",
- " -0.430062 \n",
- " 1.000000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " annual_salary a_age length\n",
- "annual_salary 1.000000 0.205404 -0.361611\n",
- "a_age 0.205404 1.000000 -0.430062\n",
- "length -0.361611 -0.430062 1.000000"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_contracts[[\"annual_salary\",\"a_age\",\"length\"]].corr()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "sns.scatterplot(x=df_contracts.length,y=df_contracts.annual_salary)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "Other ways to measure correlation exist. For example, if you are interested in how one variable will increase (or decrease) as another variable increases (or decreases), the *Spearman's or Kendall's rank correlation coefficients* might work well."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Questions\n",
- "\n",
- "* Try to explore the correlation of other variables in the dataset.\n",
- "* Can you think of a possible motivation for the trend we see: older apprentices with a shorter contract getting on average a higher annual salary?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Sampling and uncertainty\n",
- "\n",
- "Often, we work with samples and we want the sample to be representative of the population it is taken from, in order to draw conclusions that generalise from the sample to the full population.\n",
- "\n",
- "Sampling is *tricky*. Samples have *variance* (variation between samples from the same population) and *bias* (systematic variation from the population)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Further reading\n",
- "\n",
- "* For a complementary introduction to statistics and data analysis, see https://www.humanitiesdataanalysis.org/statistics-essentials/notebook.html.\n",
- "* Related to statistics and data analysis is the realm of probability theory, which allows us to formally model and calculate the likelihood of events. For an introduction, see https://www.humanitiesdataanalysis.org/intro-probability/notebook.html."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "---"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Part 2: working with texts\n",
- "\n",
- "Let's get some basics (or a refresher) of working with texts in Python. Texts are sequences of discrete symbols (words or, more generically, tokens).\n",
- "\n",
- "Key challenge: representing text for further processing. Two mainstream approaches:\n",
-    "* *Bag of words*: a text is a collection of tokens occurring with a certain frequency and assumed to be independent of each other within the text. The mapping from texts to features is deterministic and straightforward: each text is represented as a vector of the size of the vocabulary.\n",
- "* *Embeddings*: a method is used (typically, neural networks), to learn a mapping from each token to a (usually small) vector representing it. A text can be represented in turn as an aggregation of these embeddings."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Import the dataset\n",
-    "Let us load Elon Musk's tweets dataset into memory.\n",
- "\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "root_folder = \"../data/musk_tweets\"\n",
- "df_elon = pd.read_csv(codecs.open(os.path.join(root_folder,\"elonmusk_tweets.csv\"), encoding=\"utf8\"), sep=\",\")\n",
- "df_elon['text'] = df_elon['text'].str[1:]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " created_at \n",
- " text \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 849636868052275200 \n",
- " 2017-04-05 14:56:29 \n",
- " 'And so the robots spared humanity ... https:/... \n",
- " \n",
- " \n",
- " 1 \n",
- " 848988730585096192 \n",
- " 2017-04-03 20:01:01 \n",
- " \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
- " \n",
- " \n",
- " 2 \n",
- " 848943072423497728 \n",
- " 2017-04-03 16:59:35 \n",
- " '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- " \n",
- " \n",
- " 3 \n",
- " 848935705057280001 \n",
- " 2017-04-03 16:30:19 \n",
- " 'Stormy weather in Shortville ...' \n",
- " \n",
- " \n",
- " 4 \n",
- " 848416049573658624 \n",
- " 2017-04-02 06:05:23 \n",
- " \"@DaveLeeBBC @verge Coal is dying due to nat g... \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id created_at \\\n",
- "0 849636868052275200 2017-04-05 14:56:29 \n",
- "1 848988730585096192 2017-04-03 20:01:01 \n",
- "2 848943072423497728 2017-04-03 16:59:35 \n",
- "3 848935705057280001 2017-04-03 16:30:19 \n",
- "4 848416049573658624 2017-04-02 06:05:23 \n",
- "\n",
- " text \n",
- "0 'And so the robots spared humanity ... https:/... \n",
- "1 \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
- "2 '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- "3 'Stormy weather in Shortville ...' \n",
- "4 \"@DaveLeeBBC @verge Coal is dying due to nat g... "
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_elon.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(2819, 3)"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_elon.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Natural Language Processing in Python"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
- "# import some of the most popular libraries for NLP in Python\n",
- "import spacy\n",
- "import nltk\n",
- "import string\n",
- "import sklearn"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[nltk_data] Downloading package punkt to /Users/matteo/nltk_data...\n",
- "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "nltk.download('punkt')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "A typical NLP pipeline might look like the following:\n",
- " \n",
- " \n",
- "\n",
- "### Tokenization: splitting a text into constituent tokens"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [],
- "source": [
- "from nltk.tokenize import TweetTokenizer, word_tokenize\n",
- "tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\"@ForIn2020 @waltmossberg @mims @defcon_5 Exactly. Tesla is absurdly overvalued if based on the past, but that's irr\\xe2\\x80\\xa6 https://t.co/qQcTqkzgMl\"\n"
- ]
- }
- ],
- "source": [
- "example_tweet = df_elon.text[1]\n",
- "print(example_tweet)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['\"', '@ForIn2020', '@waltmossberg', '@mims', '@defcon_5', 'Exactly', '.', 'Tesla', 'is', 'absurdly', 'overvalued', 'if', 'based', 'on', 'the', 'past', ',', 'but', \"that's\", 'irr', '\\\\', 'xe2', '\\\\', 'x80', '\\\\', 'xa6', 'https://t.co/qQcTqkzgMl', '\"']\n",
- "['``', '@', 'ForIn2020', '@', 'waltmossberg', '@', 'mims', '@', 'defcon_5', 'Exactly', '.', 'Tesla', 'is', 'absurdly', 'overvalued', 'if', 'based', 'on', 'the', 'past', ',', 'but', 'that', \"'s\", 'irr\\\\xe2\\\\x80\\\\xa6', 'https', ':', '//t.co/qQcTqkzgMl', \"''\"]\n"
- ]
- }
- ],
- "source": [
- "tkz1 = tknzr.tokenize(example_tweet)\n",
- "print(tkz1)\n",
- "tkz2 = word_tokenize(example_tweet)\n",
- "print(tkz2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "Question: can you spot what the Twitter tokenizer does differently from a standard one?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "string.punctuation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "# some more pre-processing\n",
- "\n",
- "def filter(tweet):\n",
- " \n",
- " # remove punctuation and short words and urls\n",
- " tweet = [t for t in tweet if t not in string.punctuation and len(t) > 3 and not t.startswith(\"http\")]\n",
- " return tweet\n",
- "\n",
- "def tokenize_and_string(tweet):\n",
- " \n",
- " tkz = tknzr.tokenize(tweet)\n",
- " \n",
- " tkz = filter(tkz)\n",
- " \n",
- " return \" \".join(tkz)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['\"', '@ForIn2020', '@waltmossberg', '@mims', '@defcon_5', 'Exactly', '.', 'Tesla', 'is', 'absurdly', 'overvalued', 'if', 'based', 'on', 'the', 'past', ',', 'but', \"that's\", 'irr', '\\\\', 'xe2', '\\\\', 'x80', '\\\\', 'xa6', 'https://t.co/qQcTqkzgMl', '\"']\n",
- "['@ForIn2020', '@waltmossberg', '@mims', '@defcon_5', 'Exactly', 'Tesla', 'absurdly', 'overvalued', 'based', 'past', \"that's\"]\n"
- ]
- }
- ],
- "source": [
- "print(tkz1)\n",
- "print(filter(tkz1))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_elon[\"clean_text\"] = df_elon[\"text\"].apply(tokenize_and_string)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " created_at \n",
- " text \n",
- " clean_text \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 849636868052275200 \n",
- " 2017-04-05 14:56:29 \n",
- " 'And so the robots spared humanity ... https:/... \n",
- " robots spared humanity \n",
- " \n",
- " \n",
- " 1 \n",
- " 848988730585096192 \n",
- " 2017-04-03 20:01:01 \n",
- " \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
- " @ForIn2020 @waltmossberg @mims @defcon_5 Exact... \n",
- " \n",
- " \n",
- " 2 \n",
- " 848943072423497728 \n",
- " 2017-04-03 16:59:35 \n",
- " '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- " @waltmossberg @mims @defcon_5 Walt \n",
- " \n",
- " \n",
- " 3 \n",
- " 848935705057280001 \n",
- " 2017-04-03 16:30:19 \n",
- " 'Stormy weather in Shortville ...' \n",
- " Stormy weather Shortville \n",
- " \n",
- " \n",
- " 4 \n",
- " 848416049573658624 \n",
- " 2017-04-02 06:05:23 \n",
- " \"@DaveLeeBBC @verge Coal is dying due to nat g... \n",
- " @DaveLeeBBC @verge Coal dying fracking It's ba... \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id created_at \\\n",
- "0 849636868052275200 2017-04-05 14:56:29 \n",
- "1 848988730585096192 2017-04-03 20:01:01 \n",
- "2 848943072423497728 2017-04-03 16:59:35 \n",
- "3 848935705057280001 2017-04-03 16:30:19 \n",
- "4 848416049573658624 2017-04-02 06:05:23 \n",
- "\n",
- " text \\\n",
- "0 'And so the robots spared humanity ... https:/... \n",
- "1 \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
- "2 '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
- "3 'Stormy weather in Shortville ...' \n",
- "4 \"@DaveLeeBBC @verge Coal is dying due to nat g... \n",
- "\n",
- " clean_text \n",
- "0 robots spared humanity \n",
- "1 @ForIn2020 @waltmossberg @mims @defcon_5 Exact... \n",
- "2 @waltmossberg @mims @defcon_5 Walt \n",
- "3 Stormy weather Shortville \n",
- "4 @DaveLeeBBC @verge Coal dying fracking It's ba... "
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_elon.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [],
- "source": [
- "# save cleaned up version\n",
- "\n",
- "df_elon.to_csv(os.path.join(root_folder,\"df_elon.csv\"), index=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Building a dictionary"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(2819, 7864)"
- ]
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.feature_extraction.text import CountVectorizer\n",
- "count_vect = CountVectorizer(lowercase=False, tokenizer=tknzr.tokenize)\n",
- "X_count = count_vect.fit_transform(df_elon.clean_text)\n",
- "X_count.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "6617"
- ]
- },
- "execution_count": 45,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "word_list = count_vect.get_feature_names_out() \n",
- "count_list = X_count.toarray().sum(axis=0)\n",
- "dictionary = dict(zip(word_list,count_list))\n",
- "count_vect.vocabulary_.get(\"robots\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "3"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X_count[:,count_vect.vocabulary_.get(\"robots\")].toarray().sum()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "3"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dictionary[\"robots\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Questions\n",
- "\n",
- "* Find the tokens most used by Elon.\n",
- "* Find the twitter users most referred to by Elon (hint: use the @ handler to spot them)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('Tesla', 322),\n",
- " ('Model', 236),\n",
- " ('that', 223),\n",
- " ('will', 218),\n",
- " ('with', 177),\n",
- " ('@SpaceX', 169),\n",
- " ('from', 163),\n",
- " ('this', 159),\n",
- " ('@TeslaMotors', 149),\n",
- " ('launch', 124)]"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dictionary_list = sorted(dictionary.items(), key=lambda x:x[1], reverse=True)\n",
- "[d for d in dictionary_list][:10]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('@SpaceX', 169),\n",
- " ('@TeslaMotors', 149),\n",
- " ('@elonmusk', 85),\n",
- " ('@NASA', 48),\n",
- " ('@Space_Station', 19),\n",
- " ('@FredericLambert', 17),\n",
- " ('@ID_AA_Carmack', 15),\n",
- " ('@WIRED', 14),\n",
- " ('@vicentes', 14),\n",
- " ('@BadAstronomer', 11)]"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dictionary_list_users = sorted(dictionary.items(), key=lambda x:x[1], reverse=True)\n",
- "[d for d in dictionary_list if d[0].startswith('@')][:10]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Representing tweets as vectors\n",
- "\n",
- "Texts are of variable length and need to be represented numerically in some way. Most typically, we represent them as *equally-sized vectors*.\n",
- "\n",
- "Actually, this is what we have already done! Let's take a closer look at `X_count` above.."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "id 849636868052275200\n",
- "created_at 2017-04-05 14:56:29\n",
- "text 'And so the robots spared humanity ... https:/...\n",
- "clean_text robots spared humanity\n",
- "Name: 0, dtype: object"
- ]
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# This is the first Tweet of the data frame\n",
- "\n",
- "df_elon.loc[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
- "# let's get the vector representation for this Tweet\n",
- "\n",
- "vector_representation = X_count[0,:]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "3"
- ]
- },
- "execution_count": 52,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# there are 3 positions not to zero, as we would expect: the vector contains 1 in the columns related to the 3 words that make up the Tweet. \n",
- "# It would contain a number higher than 1 if a given word were occurring multiple times.\n",
- "\n",
- "np.sum(vector_representation)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1\n",
- "1\n",
- "1\n"
- ]
- }
- ],
- "source": [
- "# Let's check that indeed the vector contains 1s for the right words\n",
- "# Remember, the vector has shape (1 x size of the vocabulary)\n",
- "\n",
- "print(vector_representation[0,count_vect.vocabulary_.get(\"robots\")])\n",
- "print(vector_representation[0,count_vect.vocabulary_.get(\"spared\")])\n",
- "print(vector_representation[0,count_vect.vocabulary_.get(\"humanity\")])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Term Frequency - Inverse Document Frequency\n",
- "We can use boolean counts (1/0) and raw counts (as we did before) to represent a Tweet over the space of the vocabulary, but there exist improvements on this basic idea. For example, the TF-IDF weighting scheme:\n",
- "\n",
- "$tfidf(t, d, D) = tf(t, d) \\cdot idf(t, D)$\n",
- "\n",
- "$tf(t, d) = f_{t,d}$\n",
- "\n",
- "$idf(t, D) = log \\Big( \\frac{|D|}{|{d \\in D: t \\in d}|} \\Big)$"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(2819, 7864)"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "count_vect = TfidfVectorizer(lowercase=False, tokenizer=tknzr.tokenize)\n",
- "X_count_tfidf = count_vect.fit_transform(df_elon.clean_text)\n",
- "X_count_tfidf.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1.7226760995112569"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X_count_tfidf[0,:].sum()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "3"
- ]
- },
- "execution_count": 56,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X_count[0,:].sum()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Sparse vectors (mention)\n",
- "How is Python representing these vectors in memory? Most of their cells are set to zero. \n",
- "\n",
- "We call any vector or matrix whose cells are mostly to zero *sparse*.\n",
- "There are efficient ways to store them in memory."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "<1x7864 sparse matrix of type ''\n",
- "\twith 3 stored elements in Compressed Sparse Row format>"
- ]
- },
- "execution_count": 57,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "X_count_tfidf[0,:]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Spacy pipelines\n",
- "\n",
- "Useful to construct sequences of pre-processing steps: https://spacy.io/usage/processing-pipelines."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [
- {
- "ename": "OSError",
- "evalue": "[E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
- "Input \u001b[0;32mIn [58]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Load a pre-trained pipeline (Web Small): https://spacy.io/usage/models\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m#!python -m spacy download en_core_web_sm\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m nlp \u001b[38;5;241m=\u001b[39m \u001b[43mspacy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43men_core_web_sm\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m~/.pyenv/versions/3.10.0/envs/ada-dhoxss-2022/lib/python3.10/site-packages/spacy/__init__.py:51\u001b[0m, in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, exclude, config)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(\n\u001b[1;32m 31\u001b[0m name: Union[\u001b[38;5;28mstr\u001b[39m, Path],\n\u001b[1;32m 32\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 36\u001b[0m config: Union[Dict[\u001b[38;5;28mstr\u001b[39m, Any], Config] \u001b[38;5;241m=\u001b[39m util\u001b[38;5;241m.\u001b[39mSimpleFrozenDict(),\n\u001b[1;32m 37\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Language:\n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m\"\"\"Load a spaCy model from an installed package or a local path.\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03m name (str): Package name or model path.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;124;03m RETURNS (Language): The loaded nlp object.\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mutil\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 52\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvocab\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvocab\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdisable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexclude\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexclude\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m~/.pyenv/versions/3.10.0/envs/ada-dhoxss-2022/lib/python3.10/site-packages/spacy/util.py:427\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, exclude, config)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m OLD_MODEL_SHORTCUTS:\n\u001b[1;32m 426\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE941\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname, full\u001b[38;5;241m=\u001b[39mOLD_MODEL_SHORTCUTS[name])) \u001b[38;5;66;03m# type: ignore[index]\u001b[39;00m\n\u001b[0;32m--> 427\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE050\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname))\n",
- "\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory."
- ]
- }
- ],
- "source": [
- "# Load a pre-trained pipeline (Web Small): https://spacy.io/usage/models\n",
- "\n",
- "#!python -m spacy download en_core_web_sm\n",
- "nlp = spacy.load('en_core_web_sm')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "*.. the modelās meta.json tells spaCy to use the language \"en\" and the pipeline [\"tagger\", \"parser\", \"ner\"]. spaCy will then initialize spacy.lang.en.English, and create each pipeline component and add it to the processing pipeline. Itāll then load in the modelās data from its data directory and return the modified Language class for you to use as the nlp object.*\n",
- "\n",
- "Let's create a simple pipeline that does **lemmatization**, **part of speech tagging** and **named entity recognition** using spaCy models.\n",
- "\n",
- "*If you don't know what these NLP tasks are, please ask!*"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tweet_pos = list()\n",
- "tweet_ner = list()\n",
- "tweet_lemmas = list()\n",
- "\n",
- "for tweet in df_elon.text.values:\n",
- " spacy_tweet = nlp(tweet)\n",
- " \n",
- " local_tweet_pos = list()\n",
- " local_tweet_ner = list()\n",
- " local_tweet_lemmas = list()\n",
- " \n",
- " for sentence in list(spacy_tweet.sents):\n",
- " # --- lemmatization, remove punctuation and stop wors\n",
- " local_tweet_lemmas.extend([token.lemma_ for token in sentence if not token.is_punct | token.is_stop])\n",
- " local_tweet_pos.extend([token.pos_ for token in sentence if not token.is_punct | token.is_stop])\n",
- " for ent in spacy_tweet.ents:\n",
- " local_tweet_ner.append(ent)\n",
- "\n",
- " tweet_pos.append(local_tweet_pos)\n",
- " tweet_ner.append(local_tweet_ner)\n",
- " tweet_lemmas.append(local_tweet_lemmas)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tweet_lemmas[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tweet_pos[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tweet_ner[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# but it actually works!\n",
- "\n",
- "tweet_ner[3]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "*Note: we are really just scratching the surface of spaCy, but it is worth knowing it's there.*"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Searching tweets\n",
- "\n",
- "Once we have represented Tweets as vectors, we can easily find similar ones using basic operations such as filtering."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "target = 0\n",
- "print(df_elon.clean_text[target])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "condition = X_count_tfidf[target,:] > 0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(condition)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_filtered = X_count_tfidf[:,np.ravel(condition.toarray())]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_filtered"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(X_filtered)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from scipy import sparse\n",
- "\n",
- "sparse.find(X_filtered)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tweet_indices = list(sparse.find(X_filtered)[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"TARGET: \" + df_elon.clean_text[target])\n",
- "\n",
- "for n, tweet_index in enumerate(list(set(tweet_indices))):\n",
- " if tweet_index != target:\n",
- " print(str(n) +\")\"+ df_elon.clean_text[tweet_index])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Questions\n",
- "\n",
- "* Can you rank the matched tweets using their tf-idf weights, so to put higher weighted tweets first?\n",
- "* Which limitations do you think a bag of words representation has?\n",
- "* Can you spot any limitations of this approach based on similarity measures over bag of words representations?"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "---"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/3.2 Exploratory data analysis and working with texts.ipynb b/notebooks/3.2 Exploratory data analysis and working with texts.ipynb
new file mode 100644
index 0000000..34d44c9
--- /dev/null
+++ b/notebooks/3.2 Exploratory data analysis and working with texts.ipynb
@@ -0,0 +1,2788 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3.2 Exploratory data analysis and working with texts\n",
+ "\n",
+ "In this notebook, we learn about:\n",
+ "1. descriptive statistics to explore data;\n",
+ "2. working with texts (hints)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part 1: descriptive statistics\n",
+ "\n",
+ "*The goal of exploratory data analysis is to develop an understanding of your data. EDA is fundamentally a creative process. And like most creative processes, the key to asking quality questions is to generate a large quantity of questions.* \n",
+ "\n",
+ "Key questions:\n",
+ "* Which kind of variation occurs within variables?\n",
+ "* Which kind of co-variation occurs between variables?\n",
+ "\n",
+ "https://r4ds.had.co.nz/exploratory-data-analysis.html"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# imports\n",
+ "\n",
+ "import os, codecs\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import the dataset\n",
+ "Let us import the Venetian apprenticeship contracts dataset into memory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "root_folder = \"../data/apprenticeship_venice/\"\n",
+ "df_contracts = pd.read_csv(codecs.open(os.path.join(root_folder,\"professions_data.csv\"), encoding=\"utf8\"), sep=\";\")\n",
+ "df_professions = pd.read_csv(codecs.open(os.path.join(root_folder,\"professions_classification.csv\"), encoding=\"utf8\"), sep=\",\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's take another look at the dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 9653 entries, 0 to 9652\n",
+ "Data columns (total 47 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 page_title 9653 non-null object \n",
+ " 1 register 9653 non-null object \n",
+ " 2 annual_salary 7870 non-null float64\n",
+ " 3 a_profession 9653 non-null object \n",
+ " 4 profession_code_strict 9618 non-null object \n",
+ " 5 profession_code_gen 9614 non-null object \n",
+ " 6 profession_cat 9597 non-null object \n",
+ " 7 corporation 9350 non-null object \n",
+ " 8 keep_profession_a 9653 non-null int64 \n",
+ " 9 complete_profession_a 9653 non-null int64 \n",
+ " 10 enrolmentY 9628 non-null float64\n",
+ " 11 enrolmentM 9631 non-null float64\n",
+ " 12 startY 9533 non-null float64\n",
+ " 13 startM 9539 non-null float64\n",
+ " 14 length 9645 non-null float64\n",
+ " 15 has_fled 9653 non-null int64 \n",
+ " 16 m_profession 9535 non-null object \n",
+ " 17 m_profession_code_strict 9508 non-null object \n",
+ " 18 m_profession_code_gen 9506 non-null object \n",
+ " 19 m_profession_cat 9489 non-null object \n",
+ " 20 m_corporation 9276 non-null object \n",
+ " 21 keep_profession_m 9653 non-null int64 \n",
+ " 22 complete_profession_m 9653 non-null int64 \n",
+ " 23 m_gender 9554 non-null float64\n",
+ " 24 m_name 9623 non-null object \n",
+ " 25 m_surname 6960 non-null object \n",
+ " 26 m_patronimic 2620 non-null object \n",
+ " 27 m_atelier 1434 non-null object \n",
+ " 28 m_coords 9639 non-null object \n",
+ " 29 a_name 9653 non-null object \n",
+ " 30 a_age 9303 non-null float64\n",
+ " 31 a_gender 9522 non-null float64\n",
+ " 32 a_geo_origins 7149 non-null object \n",
+ " 33 a_geo_origins_std 4636 non-null object \n",
+ " 34 a_coords 9610 non-null object \n",
+ " 35 a_quondam 7848 non-null float64\n",
+ " 36 accommodation_master 9653 non-null int64 \n",
+ " 37 personal_care_master 9653 non-null int64 \n",
+ " 38 clothes_master 9653 non-null int64 \n",
+ " 39 generic_expenses_master 9653 non-null int64 \n",
+ " 40 salary_in_kind_master 9653 non-null int64 \n",
+ " 41 pledge_goods_master 9653 non-null int64 \n",
+ " 42 pledge_money_master 9653 non-null int64 \n",
+ " 43 salary_master 9653 non-null int64 \n",
+ " 44 female_guarantor 9653 non-null int64 \n",
+ " 45 period_cat 7891 non-null float64\n",
+ " 46 incremental_salary 9653 non-null int64 \n",
+ "dtypes: float64(11), int64(15), object(21)\n",
+ "memory usage: 3.5+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_contracts.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " page_title \n",
+ " register \n",
+ " annual_salary \n",
+ " a_profession \n",
+ " profession_code_strict \n",
+ " profession_code_gen \n",
+ " profession_cat \n",
+ " corporation \n",
+ " keep_profession_a \n",
+ " complete_profession_a \n",
+ " ... \n",
+ " personal_care_master \n",
+ " clothes_master \n",
+ " generic_expenses_master \n",
+ " salary_in_kind_master \n",
+ " pledge_goods_master \n",
+ " pledge_money_master \n",
+ " salary_master \n",
+ " female_guarantor \n",
+ " period_cat \n",
+ " incremental_salary \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Carlo Della sosta (Orese) 1592-08-03 \n",
+ " asv, giustizia vecchia, accordi dei garzoni, 1... \n",
+ " NaN \n",
+ " orese \n",
+ " orese \n",
+ " orefice \n",
+ " orefice \n",
+ " Oresi \n",
+ " 1 \n",
+ " 1 \n",
+ " ... \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " NaN \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Antonio quondam Andrea (squerariol) 1583-01-09 \n",
+ " asv, giustizia vecchia, accordi dei garzoni, 1... \n",
+ " 12.5 \n",
+ " squerariol \n",
+ " squerariol \n",
+ " lavori allo squero \n",
+ " lavori allo squero \n",
+ " Squerarioli \n",
+ " 1 \n",
+ " 1 \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1.0 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Cristofollo di Zuane (batioro in carta) 1591-0... \n",
+ " asv, giustizia vecchia, accordi dei garzoni, 1... \n",
+ " NaN \n",
+ " batioro \n",
+ " batioro \n",
+ " battioro \n",
+ " fabbricatore di foglie/fili/cordelle d'oro o a... \n",
+ " Battioro \n",
+ " 1 \n",
+ " 1 \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " NaN \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ "3 rows × 47 columns \n",
+ " "
+ ],
+ "text/plain": [
+ " page_title \\\n",
+ "0 Carlo Della sosta (Orese) 1592-08-03 \n",
+ "1 Antonio quondam Andrea (squerariol) 1583-01-09 \n",
+ "2 Cristofollo di Zuane (batioro in carta) 1591-0... \n",
+ "\n",
+ " register annual_salary \\\n",
+ "0 asv, giustizia vecchia, accordi dei garzoni, 1... NaN \n",
+ "1 asv, giustizia vecchia, accordi dei garzoni, 1... 12.5 \n",
+ "2 asv, giustizia vecchia, accordi dei garzoni, 1... NaN \n",
+ "\n",
+ " a_profession profession_code_strict profession_code_gen \\\n",
+ "0 orese orese orefice \n",
+ "1 squerariol squerariol lavori allo squero \n",
+ "2 batioro batioro battioro \n",
+ "\n",
+ " profession_cat corporation \\\n",
+ "0 orefice Oresi \n",
+ "1 lavori allo squero Squerarioli \n",
+ "2 fabbricatore di foglie/fili/cordelle d'oro o a... Battioro \n",
+ "\n",
+ " keep_profession_a complete_profession_a ... personal_care_master \\\n",
+ "0 1 1 ... 1 \n",
+ "1 1 1 ... 0 \n",
+ "2 1 1 ... 0 \n",
+ "\n",
+ " clothes_master generic_expenses_master salary_in_kind_master \\\n",
+ "0 1 1 0 \n",
+ "1 0 1 0 \n",
+ "2 0 0 0 \n",
+ "\n",
+ " pledge_goods_master pledge_money_master salary_master female_guarantor \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 1 0 \n",
+ "2 0 0 0 0 \n",
+ "\n",
+ " period_cat incremental_salary \n",
+ "0 NaN 0 \n",
+ "1 1.0 0 \n",
+ "2 NaN 0 \n",
+ "\n",
+ "[3 rows x 47 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['page_title', 'register', 'annual_salary', 'a_profession',\n",
+ " 'profession_code_strict', 'profession_code_gen', 'profession_cat',\n",
+ " 'corporation', 'keep_profession_a', 'complete_profession_a',\n",
+ " 'enrolmentY', 'enrolmentM', 'startY', 'startM', 'length', 'has_fled',\n",
+ " 'm_profession', 'm_profession_code_strict', 'm_profession_code_gen',\n",
+ " 'm_profession_cat', 'm_corporation', 'keep_profession_m',\n",
+ " 'complete_profession_m', 'm_gender', 'm_name', 'm_surname',\n",
+ " 'm_patronimic', 'm_atelier', 'm_coords', 'a_name', 'a_age', 'a_gender',\n",
+ " 'a_geo_origins', 'a_geo_origins_std', 'a_coords', 'a_quondam',\n",
+ " 'accommodation_master', 'personal_care_master', 'clothes_master',\n",
+ " 'generic_expenses_master', 'salary_in_kind_master',\n",
+ " 'pledge_goods_master', 'pledge_money_master', 'salary_master',\n",
+ " 'female_guarantor', 'period_cat', 'incremental_salary'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Every row represents an apprenticeship contract. Contracts were registered both at the guild's and at a public office. This is a sample of contracts from a much larger set of records.\n",
+ "\n",
+ "Some of the variables we will work with are:\n",
+ "* `annual_salary`: the annual salary paid to the apprentice, if any (in Venetian ducats).\n",
+ "* `a_profession` to `corporation`: increasingly generic classifications for the apprentice's stated profession.\n",
+ "* `startY` and `enrolmentY`: contract start and registration year respectively.\n",
+ "* `length`: of the contract, in years.\n",
+ "* `m_gender` and `a_gender`: of master and apprentice respectively.\n",
+ "* `a_age`: age of the apprentice at entry, in years.\n",
+ "* `female_guarantor`: if at least one of the contract's guarantors was female, boolean."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0 9424\n",
+ "0.0 130\n",
+ "Name: m_gender, dtype: int64"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.m_gender.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Trascrizione \n",
+ " Standard \n",
+ " Gruppo 0 \n",
+ " Gruppo 1 \n",
+ " Gruppo 2 \n",
+ " Gruppo 3 \n",
+ " Gruppo 4 \n",
+ " Corporazione \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " al negotio del libraro \n",
+ " librer \n",
+ " libraio \n",
+ " librai - diverse specializzazioni \n",
+ " stampa \n",
+ " altre lavorazioni manifatturiere \n",
+ " beni \n",
+ " libreri, stampatori e ligadori \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " arte de far arpicordi \n",
+ " arte de far arpicordi \n",
+ " fabbricatore di arpicordi \n",
+ " fabbricatore di strumenti musicali \n",
+ " musica \n",
+ " altri servizi \n",
+ " servizi \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " arte de' colori \n",
+ " arte dei colori \n",
+ " fabbricazione/vendita di colori \n",
+ " colori \n",
+ " colori \n",
+ " decorazioni e mestieri dell'arte \n",
+ " beni \n",
+ " spezieri \n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ " Trascrizione Standard \\\n",
+ "0 al negotio del libraro librer \n",
+ "1 arte de far arpicordi arte de far arpicordi \n",
+ "2 arte de' colori arte dei colori \n",
+ "\n",
+ " Gruppo 0 Gruppo 1 \\\n",
+ "0 libraio librai - diverse specializzazioni \n",
+ "1 fabbricatore di arpicordi fabbricatore di strumenti musicali \n",
+ "2 fabbricazione/vendita di colori colori \n",
+ "\n",
+ " Gruppo 2 Gruppo 3 Gruppo 4 \\\n",
+ "0 stampa altre lavorazioni manifatturiere beni \n",
+ "1 musica altri servizi servizi \n",
+ "2 colori decorazioni e mestieri dell'arte beni \n",
+ "\n",
+ " Corporazione \n",
+ "0 libreri, stampatori e ligadori \n",
+ "1 NaN \n",
+ "2 spezieri "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_professions.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The professions data frame contains a classification system for each profession as found in the records (transcription, first column). The last column is the guild (or corporation) which governed the given profession. This work was performed manually by historians. We don't use it here as the classifications we need are already part of the main dataframe."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Questions\n",
+ "\n",
+ "* Plot the distribution (histogram) of the apprentices' age, contract length, annual salary and start year.\n",
+ "* Calculate the proportion of female apprentices and masters, and of contracts with a female guarantor.\n",
+ "* How likely is it for a female apprentice to have a female master? And for a male apprentice?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAUuElEQVR4nO3df4zcd53f8ecLk4YovpCkSVbGdutU56suP4pRVq6l9KQ1pBcfoDpIF8koJYmgMoqCBKqrnsM/wCFL+eOANoJENRcUp/ywrIM0FpBecy6rFCnB2DSc4/xQrIubc2zFuiOAlz9cbN79Y76+jsxkd7w7u47383xIo5l5z+f7nc/bk7z2u5/5zmyqCklSG952vicgSVo4hr4kNcTQl6SGGPqS1BBDX5Ia8vbzPYGZXHXVVbVq1appx/zqV7/i0ksvXZgJvYXYd1vsuy1z7Xv//v1/V1VXn11/y4f+qlWr2Ldv37RjJicnmZiYWJgJvYXYd1vsuy1z7TvJ/xlUd3lHkhpi6EtSQ2YM/STvSLI3yU+THEzyua7+2SSvJXm2u7y/b5v7khxK8lKSW/vqNyU50D32QJLMT1uSpEGGWdM/Cby3qqaSXAT8MMkT3WNfqqo/6x+c5DpgE3A98C7gr5L8XlWdBh4CNgPPAN8HNgBPIElaEDMe6VfPVHf3ou4y3Rf2bAR2VtXJqnoFOASsTbIMuKyqnq7eF/48Ctw2p9lLks7JUGfvJFkC7Ad+F/hKVf0oyR8Bn0hyJ7AP2FJVbwDL6R3Jn3Gkq/26u312fdDzbab3GwFjY2NMTk5OO7+pqakZxyxG9t0W+27LfPU9VOh3SzNrklwOPJbkBnpLNZ+nd9T/eeALwEeBQev0NU190PNtB7YDjI+P10ynLXlKV1vsuy32PVrndPZOVf0cmAQ2VNXrVXW6qn4DfBVY2w07Aqzs22wFcLSrrxhQlyQtkGHO3rm6O8InySXALcCL3Rr9GR8Cnutu7wY2Jbk4ybXAamBvVR0DTiRZ1521cyfw+OhakSTNZJjlnWXAjm5d/23Arqr6bpL/mmQNvSWaw8DHAarqYJJdwPPAKeDebnkI4B7gEeASemfteObOebJq6/eGGnf4/g/M80wkLaQZQ7+q/hp4z4D6R6bZZhuwbUB9H3DDOc5RkjQifiJXkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSEzhn6SdyTZm+SnSQ4m+VxXvzLJk0le7q6v6NvmviSHkryU5Na++k1JDnSPPZAk89OWJGmQYY70TwLvrap3A2uADUnWAVuBPVW1GtjT3SfJdcAm4HpgA/BgkiXdvh4CNgOru8uG0bUiSZrJjKFfPVPd3Yu6SwEbgR1dfQdwW3d7I7Czqk5W1SvAIWBtkmXAZVX1dFUV8GjfNpKkBfD2YQZ1R+r7gd8FvlJVP0oyVlXHAKrqWJJruuHLgWf6Nj/S1X7d3T67Puj5NtP7jYCxsTEmJyennd/U1NSMYxajufS95cZTQ417K/67+nq3xb5Ha6jQr6rTwJoklwOPJblhmuGD1ulrmvqg59sObAcYHx+viYmJaec3OTnJTGMWo7n0fffW7w017vAds9v/fPL1bot9j9Y5nb1TVT8HJumtxb/eLdnQXR/vhh0BVvZttgI42tVXDKhLkhbIMGfvXN0d4ZPkEuAW4EVgN3BXN+wu4PHu9m5gU5KLk1xL7w3bvd1S0Ikk67qzdu7s20aStACGWd5ZBuzo1vXfBuyqqu8meRrYleRjwKvA7QBVdTDJLuB54BRwb7c8BHAP8AhwCfBEd5EkLZAZQ7+q/hp4z4D63wPve5NttgHbBtT3AdO9HyBJmkd+IleSGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLU
EENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpITOGfpKVSX6Q5IUkB5N8sqt/NslrSZ7tLu/v2+a+JIeSvJTk1r76TUkOdI89kCTz05YkaZC3DzHmFLClqn6S5HeA/Ume7B77UlX9Wf/gJNcBm4DrgXcBf5Xk96rqNPAQsBl4Bvg+sAF4YjStSJJmMuORflUdq6qfdLdPAC8Ay6fZZCOws6pOVtUrwCFgbZJlwGVV9XRVFfAocNtcG5AkDS+9/B1ycLIKeAq4Afj3wN3AL4F99H4beCPJl4Fnqurr3TYP0zuaPwzcX1W3dPU/AP6kqj444Hk20/uNgLGxsZt27tw57bympqZYunTp0H0sFnPp+8Brvxhq3I3L3zmr/c8nX++22PfsrF+/fn9VjZ9dH2Z5B4AkS4FvA5+qql8meQj4PFDd9ReAjwKD1ulrmvpvF6u2A9sBxsfHa2JiYtq5TU5OMtOYxWgufd+99XtDjTt8x+z2P598vdti36M11Nk7SS6iF/jfqKrvAFTV61V1uqp+A3wVWNsNPwKs7Nt8BXC0q68YUJckLZBhzt4J8DDwQlV9sa++rG/Yh4Dnutu7gU1JLk5yLbAa2FtVx4ATSdZ1+7wTeHxEfUiShjDM8s7NwEeAA0me7WqfBj6cZA29JZrDwMcBqupgkl3A8/TO/Lm3O3MH4B7gEeASeuv8nrkjSQtoxtCvqh8yeD3++9Nssw3YNqC+j96bwJKk88BP5EpSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1ZMbQT7IyyQ+SvJDkYJJPdvUrkzyZ5OXu+oq+be5LcijJS0lu7avflORA99gDSTI/bUmSBhnmSP8UsKWqfh9YB9yb5DpgK7CnqlYDe7r7dI9tAq4HNgAPJlnS7eshYDOwurtsGGEvkqQZzBj6VXWsqn7S3T4BvAAsBzYCO7phO4DbutsbgZ1VdbKqXgEOAWuTLAMuq6qnq6qAR/u2kSQtgPTyd8jBySrgKeAG4NWqurzvsTeq6ookXwaeqaqvd/WHgSeAw8D9VXVLV/8D4E+q6oMDnmczvd8IGBsbu2nnzp3TzmtqaoqlS5cO3cdiMZe+D7z2i6HG3bj8nbPa/3zy9W6Lfc/O+vXr91fV+Nn1tw+7gyRLgW8Dn6qqX06zHD/ogZqm/tvFqu3AdoDx8fGamJiYdm6Tk5PMNGYxmkvfd2/93lDjDt8xu/3PJ1/vttj3aA119k6Si+gF/jeq6jtd+fVuyYbu+nhXPwKs7Nt8BXC0q68YUJckLZBhzt4J8DDwQlV9se+h3cBd3e27gMf76puSXJzkWnpv2O6tqmPAiSTrun3e2beNJGkBDLO8czPwEeBAkme72qeB+4FdST4GvArcDlBVB5PsAp6nd+bPvVV1utvuHuAR4BJ66/xPjKYNSdIwZgz9qvohg9fjAd73JttsA7YNqO+j9yawJOk88BO5ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWrI0H9ERReGVUP+cRRJbfJIX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktSQGUM/ydeSHE/yXF/ts0leS/Jsd3l/32P3JTmU5KUkt/bVb0pyoHvsgSQZfTuSpOkMc6T/CLBhQP1LVbWmu3wfIMl1wCbg+m6bB5Ms6cY/BGwGVneXQfuUJM2jGUO/qp4Cfjbk/jYCO6vqZFW9AhwC1iZZBlxWVU9XVQGPArfNcs6SpFmay3fvfCLJncA+YEtVvQEsB57pG3Okq/26u312faAkm+n9VsDY2BiTk5PTTmRqamrGMYvRoL633HhqpM/xVvx39fVu
i32P1mxD/yHg80B1118APgoMWqevaeoDVdV2YDvA+Ph4TUxMTDuZyclJZhqzGA3q++4Rf+Ha4TsmZhyz0Hy922LfozWrs3eq6vWqOl1VvwG+CqztHjoCrOwbugI42tVXDKhLkhbQrEK/W6M/40PAmTN7dgObklyc5Fp6b9jurapjwIkk67qzdu4EHp/DvCVJszDj8k6SbwETwFVJjgCfASaSrKG3RHMY+DhAVR1Msgt4HjgF3FtVp7td3UPvTKBLgCe6iyRpAc0Y+lX14QHlh6cZvw3YNqC+D7jhnGYnSRopP5ErSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGjKX796R/sGqIb/+4fD9H5jnmUiajkf6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0JakhM4Z+kq8lOZ7kub7alUmeTPJyd31F32P3JTmU5KUkt/bVb0pyoHvsgSQZfTuSpOkMc6T/CLDhrNpWYE9VrQb2dPdJch2wCbi+2+bBJEu6bR4CNgOru8vZ+5QkzbMZQ7+qngJ+dlZ5I7Cju70DuK2vvrOqTlbVK8AhYG2SZcBlVfV0VRXwaN82kqQFMts1/bGqOgbQXV/T1ZcDf9s37khXW97dPrsuSVpAo/5ziYPW6Wua+uCdJJvpLQUxNjbG5OTktE86NTU145jFaFDfW248NdLnGPbfddjnHcXr5OvdFvserdmG/utJllXVsW7p5nhXPwKs7Bu3Ajja1VcMqA9UVduB7QDj4+M1MTEx7WQmJyeZacxiNKjvu4f8W7VDO/CrIQcO95/S4TsmZj2VM3y922LfozXb5Z3dwF3d7buAx/vqm5JcnORaem/Y7u2WgE4kWdedtXNn3zaSpAUy4+FZkm8BE8BVSY4AnwHuB3Yl+RjwKnA7QFUdTLILeB44BdxbVae7Xd1D70ygS4AnuoskaQHNGPpV9eE3eeh9bzJ+G7BtQH0fcMM5zU6SNFJ+IleSGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1JBR/2F0aVqrzuFv+B6+/wPzOBOpTR7pS1JDDH1JaoihL0kNMfQlqSGGviQ1ZE6hn+RwkgNJnk2yr6tdmeTJJC9311f0jb8vyaEkLyW5da6TlySdm1Ec6a+vqjVVNd7d3wrsqarVwJ7uPkmuAzYB1wMbgAeTLBnB80uShjQfyzsbgR3d7R3AbX31nVV1sqpeAQ4Ba+fh+SVJbyJVNfuNk1eAN4AC/ktVbU/y86q6vG/MG1V1RZIvA89U1de7+sPAE1X1FwP2uxnYDDA2NnbTzp07p53H1NQUS5cunXUfF6pBfR947RfnaTajd+Pydw6s+3q3xb5nZ/369fv7VmD+wVw/kXtzVR1Ncg3wZJIXpxmbAbWBP3GqajuwHWB8fLwmJiamncTk5CQzjVmMBvV99zl84vWt7vAdEwPrvt5tse/RmtPyTlUd7a6PA4/RW655PckygO76eDf8CLCyb/MVwNG5PL8k6dzM+kg/yaXA26rqRHf7D4E/BXYDdwH3d9ePd5vsBr6Z5IvAu4DVwN45zL0pg76zZsuNpxbVkb2k+TeX5Z0x4LEkZ/bzzar670l+DOxK8jHgVeB2gKo6mGQX8DxwCri3qk7PafaSpHMy69Cvqr8B3j2g/vfA+95km23Attk+pyRpbvxEriQ1xO/T11vWm333/tnvZfi9+9LwPNKXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BC/huE8e7OvGpCk+eCRviQ1xNCXpIYY+pLUEENfkhpi6EtSQzx7Rxe8Yc+A8o+tSB7pS1JTDH1JasiCL+8k2QD8Z2AJ8OdVdf9Cz0GajstFWswWNPSTLAG+Avxr4Ajw4yS7q+r5hZzH
fPNTtpLeqhb6SH8tcKiq/gYgyU5gI7CoQl9vTaP+YbzQP9y33HiKu7vn9LcMzVaqauGeLPljYENV/bvu/keAf1lVnzhr3GZgc3f3nwMvzbDrq4C/G/F0LwT23Rb7bstc+/6nVXX12cWFPtLPgNpv/dSpqu3A9qF3muyrqvG5TOxCZN9tse+2zFffC332zhFgZd/9FcDRBZ6DJDVroUP/x8DqJNcm+UfAJmD3As9Bkpq1oMs7VXUqySeAv6R3yubXqurgCHY99FLQImPfbbHvtsxL3wv6Rq4k6fzyE7mS1BBDX5IackGHfpINSV5KcijJ1vM9n/mU5GtJjid5rq92ZZInk7zcXV9xPuc4aklWJvlBkheSHEzyya6+2Pt+R5K9SX7a9f25rr6o+z4jyZIk/zvJd7v7rfR9OMmBJM8m2dfVRt77BRv6fV/p8EfAdcCHk1x3fmc1rx4BNpxV2wrsqarVwJ7u/mJyCthSVb8PrAPu7V7jxd73SeC9VfVuYA2wIck6Fn/fZ3wSeKHvfit9A6yvqjV95+ePvPcLNvTp+0qHqvq/wJmvdFiUquop4GdnlTcCO7rbO4DbFnJO862qjlXVT7rbJ+gFwXIWf99VVVPd3Yu6S7HI+wZIsgL4APDnfeVF3/c0Rt77hRz6y4G/7bt/pKu1ZKyqjkEvIIFrzvN85k2SVcB7gB/RQN/dEsezwHHgyapqom/gPwH/EfhNX62FvqH3g/1/JNnffRUNzEPvF/JfzhrqKx104UuyFPg28Kmq+mUy6KVfXKrqNLAmyeXAY0luOM9TmndJPggcr6r9SSbO83TOh5ur6miSa4Ank7w4H09yIR/p+5UO8HqSZQDd9fHzPJ+RS3IRvcD/RlV9pysv+r7PqKqfA5P03s9Z7H3fDPybJIfpLde+N8nXWfx9A1BVR7vr48Bj9JawR977hRz6fqVDr9+7utt3AY+fx7mMXHqH9A8DL1TVF/seWux9X90d4ZPkEuAW4EUWed9VdV9VraiqVfT+f/6fVfVvWeR9AyS5NMnvnLkN/CHwHPPQ+wX9idwk76e3BnjmKx22nd8ZzZ8k3wIm6H3d6uvAZ4D/BuwC/gnwKnB7VZ39Zu8FK8m/Av4XcID/v8b7aXrr+ou5739B7027JfQOzHZV1Z8m+ccs4r77dcs7/6GqPthC30n+Gb2je+gtu3+zqrbNR+8XdOhLks7Nhby8I0k6R4a+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1Jasj/A72+cYT5usntAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_contracts.a_age.hist(bins=30)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 9303.000000\n",
+ "mean 14.266688\n",
+ "std 2.902770\n",
+ "min 1.000000\n",
+ "25% 12.000000\n",
+ "50% 14.000000\n",
+ "75% 16.000000\n",
+ "max 50.000000\n",
+ "Name: a_age, dtype: float64"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.a_age.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAVUElEQVR4nO3dfaxc9Z3f8fdnIWEpDgFKcmXZdE1aNy0PyoOvKFVKZBe0OAmN6QOVI7o4LZVVRFaJmpUwjdRu/7DqbcVKgSzsuiHFNOx63exGtjZiu8jNbVSJhMUJiTGE4gSXePHa3ZAHnEakpt/+MYdosO/1HV/mztzh935Joznznd858z3njj/3zG/mjlNVSJLa8AvjbkCSNDqGviQ1xNCXpIYY+pLUEENfkhpy9rgbmM/FF19cq1atWtC6P/nJTzjvvPOG29AisM/hm5Re7XP4JqXXxe5z3759f1FVbzvljqpa0pc1a9bUQn35y19e8LqjZJ/DNym92ufwTUqvi90n8HjNkqlO70hSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1JCBQj/JBUm+kOTbSZ5O8reTXJTkkSTPdtcX9o2/M8nBJM8kub6vvibJ/u6+u5NkMXZKkjS7Qc/0Pw38cVX9DeBdwNPAFmBvVa0G9na3SXIZsBG4HFgP3JvkrG479wGbgdXdZf2Q9kOSNIB5v4YhyfnA+4GPAlTVz4CfJdkArO2G7QBmgDuADcDOqnoZeC7JQeCqJIeA86vq0W67DwI3Ag8PbW9Osv/PfsRHt3xp3nGHtn1osVqQpCVlkDP9dwD/G/hPSb6R5LNJzgOmquoIQHf99m78CuB7fesf7moruuWT65KkERnkC9fOBt4L/GpVfS3Jp+mmcuYw2zx9naZ+6gaSzfSmgZiammJmZmaANk81dS588soT845b6PaH5fjx42PvYRCT0idMTq/2OXyT0uu4+hwk9A8Dh6vqa93tL9AL/aNJllfVkSTLgWN94y/pW38l8EJXXzlL/RRVtR3YDjA9PV1r164dbG9Ocs9Du7lr//y7eOjmhW1/WGZmZljoPo7SpPQJk9OrfQ7fpPQ6rj7nnd6pqj8HvpfknV3pWuApYA+wqattAnZ3y3uAjUnOSXIpvTdsH+umgF5KcnX3qZ1b+taRJI3AoN+n/6vAQ0neDHwX+Kf0fmHsSnIr8DxwE0BVHUiyi94vhhPA7VX1Sred24AHgHPpvYG7aG/iSpJONVDoV9UTwPQsd107x/itwNZZ6o8DV5xBf5KkIfIvciWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDVkoNBPcijJ/iRPJHm8q12U5JEkz3bXF/aNvzPJwSTPJLm+r76m287BJHcnyfB3SZI0lzM5019XVe+uqunu9hZgb1WtBvZ2t0lyGbARuBxYD9yb5KxunfuAzcDq7rL+9e+CJGlQr2d6ZwOwo1veAdzYV99ZVS9X1XPAQeCqJMuB86vq0aoq4MG+dSRJI5Be/s4zKHkO+AFQwO9U1fYkP6yqC/rG/KCqLkzyGeCrVfX5rn4/8DBwCNhWVdd19WuAO6rqhlkebzO9VwRMTU2t2blz54J27tiLP+LoT+cfd+WKty5o+8Ny/Phxli1bNtYeBjEpfcLk9GqfwzcpvS52n+vWrdvXNzPzc2cPuP77quqFJG8HHkny7dOMnW2evk5TP7VYtR3YDjA9PV1r164dsM3Xuueh3dy1f/5dPHTzwrY/LDMzMyx0H0dpUvqEyenVPodvUnodV58DTe9U1Qvd9THgi8BVwNFuyobu+lg3/DBwSd/qK4EXuvrKWeqSpBGZN/STnJfkLa8uA78MPAnsATZ1wzYBu7vl
PcDGJOckuZTeG7aPVdUR4KUkV3ef2rmlbx1J0ggMMr0zBXyx+3Tl2cDvVtUfJ/lTYFeSW4HngZsAqupAkl3AU8AJ4PaqeqXb1m3AA8C59Ob5Hx7ivkiS5jFv6FfVd4F3zVL/PnDtHOtsBbbOUn8cuOLM25QkDYN/kStJDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDRk49JOcleQbSf6ou31RkkeSPNtdX9g39s4kB5M8k+T6vvqaJPu7++5OkuHujiTpdM7kTP/jwNN9t7cAe6tqNbC3u02Sy4CNwOXAeuDeJGd169wHbAZWd5f1r6t7SdIZGSj0k6wEPgR8tq+8AdjRLe8Abuyr76yql6vqOeAgcFWS5cD5VfVoVRXwYN86kqQRSC9/5xmUfAH4d8BbgF+rqhuS/LCqLugb84OqujDJZ4CvVtXnu/r9wMPAIWBbVV3X1a8B7qiqG2Z5vM30XhEwNTW1ZufOnQvauWMv/oijP51/3JUr3rqg7Q/L8ePHWbZs2Vh7GMSk9AmT06t9Dt+k9LrYfa5bt25fVU2fXD97vhWT3AAcq6p9SdYO8FizzdPXaeqnFqu2A9sBpqena+3aQR72VPc8tJu79s+7ixy6eWHbH5aZmRkWuo+jNCl9wuT0ap/DNym9jqvP+RMR3gd8OMkHgV8Ezk/yeeBokuVVdaSbujnWjT8MXNK3/krgha6+cpa6JGlE5p3Tr6o7q2plVa2i9wbtf6uqfwLsATZ1wzYBu7vlPcDGJOckuZTeG7aPVdUR4KUkV3ef2rmlbx1J0ggMcqY/l23AriS3As8DNwFU1YEku4CngBPA7VX1SrfObcADwLn05vkffh2PL0k6Q2cU+lU1A8x0y98Hrp1j3FZg6yz1x4ErzrRJSdJw+Be5ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktSQeUM/yS8meSzJN5McSPJvu/pFSR5J8mx3fWHfOncmOZjkmSTX99XXJNnf3Xd3kizObkmSZjPImf7LwN+tqncB7wbWJ7ka2ALsrarVwN7uNkkuAzYClwPrgXuTnNVt6z5gM7C6u6wf3q5IkuYzb+hXz/Hu5pu6SwEbgB1dfQdwY7e8AdhZVS9X1XPAQeCqJMuB86vq0aoq4MG+dSRJI5Be/s4zqHemvg/4a8BvVdUdSX5YVRf0jflBVV2Y5DPAV6vq8139fuBh4BCwraqu6+rXAHdU1Q2zPN5meq8ImJqaWrNz584F7dyxF3/E0Z/OP+7KFW9d0PaH5fjx4yxbtmysPQxiUvqEyenVPodvUnpd7D7XrVu3r6qmT66fPcjKVfUK8O4kFwBfTHLFaYbPNk9fp6nP9njbge0A09PTtXbt2kHaPMU9D+3mrv3z7+Khmxe2/WGZmZlhofs4SpPSJ0xOr/Y5fJPS67j6PKNP71TVD4EZenPxR7spG7rrY92ww8AlfautBF7o6itnqUuSRmSQT++8rTvDJ8m5wHXAt4E9wKZu2CZgd7e8B9iY5Jwkl9J7w/axqjoCvJTk6u5TO7f0rSNJGoFBpneWAzu6ef1fAHZV1R8leRTYleRW4HngJoCqOpBkF/AUcAK4vZseArgNeAA4l948/8PD3BlJ0unNG/pV9S3gPbPUvw9cO8c6W4Gts9QfB073foAkaRH5F7mS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi
6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1JB5Qz/JJUm+nOTpJAeSfLyrX5TkkSTPdtcX9q1zZ5KDSZ5Jcn1ffU2S/d19dyfJ4uyWJGk2g5zpnwA+WVV/E7gauD3JZcAWYG9VrQb2drfp7tsIXA6sB+5Ncla3rfuAzcDq7rJ+iPsiSZrHvKFfVUeq6uvd8kvA08AKYAOwoxu2A7ixW94A7Kyql6vqOeAgcFWS5cD5VfVoVRXwYN86kqQRSC9/BxycrAK+AlwBPF9VF/Td94OqujDJZ4CvVtXnu/r9wMPAIWBbVV3X1a8B7qiqG2Z5nM30XhEwNTW1ZufOnQvauWMv/oijP51/3JUr3rqg7Q/L8ePHWbZs2Vh7GMSk9AmT06t9Dt+k9LrYfa5bt25fVU2fXD970A0kWQb8AfCJqvrxaabjZ7ujTlM/tVi1HdgOMD09XWvXrh20zde456Hd3LV//l08dPPCtj8sMzMzLHQfR2lS+oTJ6dU+h29Seh1XnwN9eifJm+gF/kNV9Ydd+Wg3ZUN3fayrHwYu6Vt9JfBCV185S12SNCKDfHonwP3A01X1m3137QE2dcubgN199Y1JzklyKb03bB+rqiPAS0mu7rZ5S986kqQRGGR6533ArwD7kzzR1f4VsA3YleRW4HngJoCqOpBkF/AUvU/+3F5Vr3Tr3QY8AJxLb57/4eHshiRpEPOGflX9D2afjwe4do51tgJbZ6k/Tu9NYEnSGPgXuZLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDBvmP0d/wVm350kDjDm370CJ3IkmLyzN9SWqIoS9JDTH0Jakhhr4kNcTQl6SGzBv6ST6X5FiSJ/tqFyV5JMmz3fWFfffdmeRgkmeSXN9XX5Nkf3ff3Uky/N2RJJ3OIGf6DwDrT6ptAfZW1Wpgb3ebJJcBG4HLu3XuTXJWt859wGZgdXc5eZuSpEU2b+hX1VeAF08qbwB2dMs7gBv76jur6uWqeg44CFyVZDlwflU9WlUFPNi3jiRpRBY6pz9VVUcAuuu3d/UVwPf6xh3uaiu65ZPrkqQRGvZf5M42T1+nqc++kWQzvakgpqammJmZWVAzU+fCJ688saB1Z7PQPuZz/PjxRdv2ME1KnzA5vdrn8E1Kr+Pqc6GhfzTJ8qo60k3dHOvqh4FL+satBF7o6itnqc+qqrYD2wGmp6dr7dq1C2rynod2c9f+4f1eO3TzwvqYz8zMDAvdx1GalD5hcnq1z+GblF7H1edCp3f2AJu65U3A7r76xiTnJLmU3hu2j3VTQC8lubr71M4tfetIkkZk3tPgJL8HrAUuTnIY+DfANmBXkluB54GbAKrqQJJdwFPACeD2qnql29Rt9D4JdC7wcHeRJI3QvKFfVR+Z465r5xi/Fdg6S/1x4Ioz6k6SNFT+Ra4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQ84edwOTZNWWLw007tC2Dy1yJ5K0MJ7pS1JDDH1JaoihL0kNcU5/EQw69w/O/0sarZGHfpL1wKeBs4DPVtW2UfewlKza8iU+eeUJPjrPLwp/OUgahpFO7yQ5C/gt4APAZcBHklw2yh4kqWWjPtO/CjhYVd8FSLIT2AA8NeI+Js6ZTBkNYtBXDv2PO8grkmE9rqTFkaoa3YMl/whYX1X/vLv9K8DfqqqPnTRuM7C5u/lO4JkFPuTFwF8scN1Rss/hm5Re7XP4JqXXxe7zl6rqbScXR32mn1lqp/zWqartwPbX/WDJ41U1/Xq3s9jsc/gmpVf7HL5J6XVcfY76I5uHgUv6bq8EXhhxD5LUrFGH/p8C
q5NcmuTNwEZgz4h7kKRmjXR6p6pOJPkY8F/pfWTzc1V1YBEf8nVPEY2IfQ7fpPRqn8M3Kb2Opc+RvpErSRovv4ZBkhpi6EtSQ96QoZ9kfZJnkhxMsmXc/bwqySVJvpzk6SQHkny8q/96kj9L8kR3+eC4ewVIcijJ/q6nx7vaRUkeSfJsd33hmHt8Z99xeyLJj5N8Yikc0ySfS3IsyZN9tTmPX5I7u+fsM0muXwK9/ock307yrSRfTHJBV1+V5Kd9x/a3x9znnD/rcR3TOfr8/b4eDyV5oquP9nhW1RvqQu8N4u8A7wDeDHwTuGzcfXW9LQfe2y2/Bfif9L6O4teBXxt3f7P0ewi4+KTavwe2dMtbgN8Yd58n/ez/HPilpXBMgfcD7wWenO/4dc+DbwLnAJd2z+GzxtzrLwNnd8u/0dfrqv5xS+CYzvqzHucxna3Pk+6/C/jX4zieb8Qz/Z9/1UNV/Qx49asexq6qjlTV17vll4CngRXj7eqMbQB2dMs7gBvH18oprgW+U1X/a9yNAFTVV4AXTyrPdfw2ADur6uWqeg44SO+5PBKz9VpVf1JVJ7qbX6X3dzVjNccxncvYjunp+kwS4B8DvzeKXk72Rgz9FcD3+m4fZgkGa5JVwHuAr3Wlj3Uvoz837imTPgX8SZJ93VdjAExV1RHo/RID3j627k61kdf+Q1qKx3Su47fUn7f/DHi47/alSb6R5L8nuWZcTfWZ7We9VI/pNcDRqnq2rzay4/lGDP2BvuphnJIsA/4A+ERV/Ri4D/irwLuBI/Re+i0F76uq99L7VtTbk7x/3A3Npftjvw8D/6UrLdVjOpcl+7xN8ingBPBQVzoC/JWqeg/wL4HfTXL+uPpj7p/1Uj2mH+G1JycjPZ5vxNBf0l/1kORN9AL/oar6Q4CqOlpVr1TV/wP+IyN8WX86VfVCd30M+CK9vo4mWQ7QXR8bX4ev8QHg61V1FJbuMWXu47ckn7dJNgE3ADdXNwHdTZd8v1veR2+u/K+Pq8fT/KyX3DFNcjbwD4Dff7U26uP5Rgz9JftVD91c3v3A01X1m3315X3D/j7w5MnrjlqS85K85dVlem/qPUnvWG7qhm0Cdo+nw1O85uxpKR7TzlzHbw+wMck5SS4FVgOPjaG/n0vvPzy6A/hwVf2fvvrb0vu/MUjyDnq9fnc8XZ72Z73kjilwHfDtqjr8amHkx3NU7xiP8gJ8kN4nY74DfGrc/fT19Xfovbz8FvBEd/kg8J+B/V19D7B8CfT6DnqffPgmcODV4wj8ZWAv8Gx3fdES6PUvAd8H3tpXG/sxpfdL6Ajwf+mddd56uuMHfKp7zj4DfGAJ9HqQ3pz4q8/V3+7G/sPuOfFN4OvA3xtzn3P+rMd1TGfrs6s/APyLk8aO9Hj6NQyS1JA34vSOJGkOhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyP8HoM4Qkzxoov4AAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_contracts.annual_salary.hist(bins=30)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 7870.000000\n",
+ "mean 5.916921\n",
+ "std 6.985214\n",
+ "min 0.166667\n",
+ "25% 3.000000\n",
+ "50% 4.000000\n",
+ "75% 6.000000\n",
+ "max 180.000000\n",
+ "Name: annual_salary, dtype: float64"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.annual_salary.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAD4CAYAAADo30HgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAVkklEQVR4nO3df4wc91nH8feD3Qa318YOoYexLRyQVUhifsSnECit7pRCTBPVARHkKlAHgqyitATkSnWoRPnHwoCC1BJSZHBUl0S9mjQlJqmhkclRIdUJcUh7cdw0LjGpE9eGNkl7JQpcePhjx7Df697d7s7u3l38fkmnnf3Od2aenZ3bz82PnYvMRJKks75roQuQJC0uBoMkqWAwSJIKBoMkqWAwSJIKyxe6gPlceOGFuX79+rb7f/vb3+b1r399/wqqyfrqsb56rK+epVTfkSNH/iMzv7erGWXmov7ZtGlTduLBBx/sqP+gWV891leP9dWzlOoDHskuP3c9lCRJKhgMkqSCwSBJKhgMkqSCwSBJKhgMkqSCwSBJKhgMkqSCwSBJKiz6W2K8mqzfeT87Nk5zw877257mxO6r+1iRJH0n9xgkSQWDQZJUMBgkSQWDQZJUMBgkSQWDQZJUMBgkSQWDQZJUMBgkSQWDQZJUmDcYIuKOiDgTEY83tf1xRHwpIr4YEZ+OiJVN426JiOMR8WREXNXUvikiJqtxH4mI6PmrkSTV1s4ew8eAzTPaHgAuzcwfBb4M3AIQERcDW4FLqmluj4hl1TQfBbYDG6qfmfOUJC0C8wZDZn4O+MaMts9m5nT19DCwthreAoxn5suZ+TRwHLg8IlYDb8zMz2dmAh8Hru3Ra5Ak9VA0Pqfn6RSxHrgvMy9tMe5vgU9m5p0RcRtwODPvrMbtBQ4CJ4Ddmfn2qv2twAcy85pZlredxt4Fw8PDm8bHx9t+QVNTUwwNDbXdf5Amn32R4RVw+qX2p9m45vz+FdTCYl5/YH11WV89S6m+sbGxI5k50s18at12OyI+CEwDd51tatEt52hvKTP3AHsARkZGcnR0tO2aJiYm6KT/IN1Q3Xb71sn2V/uJ60f7V1ALi3n9gfXVZX31nCv1dR0MEbENuAa4Mv9/t+MksK6p21rguap9bYt2SdIi09XlqhGxGfgA8M7M/M+mUQeArRFxXkRcROMk88OZeQr4VkRcUV2N9G7g3pq1S5L6YN49hoj4BDAKXBgRJ4EP0bgK6Tzggeqq08OZ+Z7MPBoR+4EnaBxiuikzX6lm9Zs0rnBaQeO8w8HevhRJUi/MGwyZ+a4WzXvn6L8L2NWi/RHgO05eS5IWF7/5LEkq1LoqSYvP+p33d9T/xO6r+1SJpKXKPQZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUsFgkCQVDAZJUmHeYIiIOyLiTEQ83tR2QUQ8EBFPVY+rmsbdEhHHI+LJiLiqqX1TRExW4z4SEdH7lyNJqqudPYaPAZtntO0EDmXmBuBQ9ZyIuBjYClxSTXN7RCyrpvkosB3YUP3MnKckaRGYNxgy83PAN2Y0bwH2VcP7gGub2scz8+XMfBo4DlweEauBN2bm5zMzgY83TSNJWkSi8Tk9T6eI9cB9mXlp9fyFzFzZNP75zFwVEbcBhzPzzqp9L3AQOAHszsy3V+1vBT6QmdfMsrztNPYuGB4e3jQ+Pt72C5qammJoaKjt/oM0+eyLDK+A0y+1P83GNed3vIxOzJz/Yl5/YH11WV89S6m+sbGxI5k50s18lve0Kmh13iDnaG8pM/cAewBGRkZydHS07QImJibopP8g3bDzfnZsnObWyfZX+4nrRzteRidmzn8xrz+wvrqsr55zpb5ur0o6XR0eono8U7WfBNY19VsLPFe1r23RLklaZLoNhgPAtmp4
G3BvU/vWiDgvIi6icZL54cw8BXwrIq6orkZ6d9M0kqRFZN5jGhHxCWAUuDAiTgIfAnYD+yPiRuAZ4DqAzDwaEfuBJ4Bp4KbMfKWa1W/SuMJpBY3zDgd7+kokST0xbzBk5rtmGXXlLP13AbtatD8CXNpRdZKkgfObz5KkQq+vStISs37GVUw7Nk7PeWXTid1X97skSQvMPQZJUsFgkCQVDAZJUsFzDDXMPD4vSa8G7jFIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgq1giEificijkbE4xHxiYj47oi4ICIeiIinqsdVTf1viYjjEfFkRFxVv3xJUq91HQwRsQb4LWAkMy8FlgFbgZ3AoczcAByqnhMRF1fjLwE2A7dHxLJ65UuSeq3uoaTlwIqIWA68DngO2ALsq8bvA66thrcA45n5cmY+DRwHLq+5fElSj0Vmdj9xxM3ALuAl4LOZeX1EvJCZK5v6PJ+ZqyLiNuBwZt5Zte8FDmbm3S3mux3YDjA8PLxpfHy87ZqmpqYYGhrq+jV1YvLZFzueZngFnH6p/f4b15zf0fy7qanZfPV1Wk+vDfL97Yb11WN99TTXNzY2diQzR7qZz/JuC6jOHWwBLgJeAP46In5lrklatLVMpczcA+wBGBkZydHR0bbrmpiYoJP+ddyw8/6Op9mxcZpbJ9tf7SeuH+1o/t3U1Gy++jqtp9cG+f52w/rqsb56elVfnUNJbweezsx/z8z/Bu4Bfho4HRGrAarHM1X/k8C6punX0jj0JElaROoEwzPAFRHxuogI4ErgGHAA2Fb12QbcWw0fALZGxHkRcRGwAXi4xvIlSX3Q9aGkzHwoIu4GHgWmgX+hcfhnCNgfETfSCI/rqv5HI2I/8ETV/6bMfKVm/ZKkHus6GAAy80PAh2Y0v0xj76FV/100TlZriVrf4TmME7uv7lMlkvrFbz5LkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpUOsmeuq/Tm9aJ0l1uccgSSoYDJKkgsEgSSoYDJKkgsEgSSoYDJKkgsEgSSoYDJKkgsEgSSoYDJKkQq1giIiVEXF3RHwpIo5FxE9FxAUR8UBEPFU9rmrqf0tEHI+IJyPiqvrlS5J6re4ew4eBv8vMHwZ+DDgG7AQOZeYG4FD1nIi4GNgKXAJsBm6PiGU1ly9J6rGugyEi3gi8DdgLkJn/lZkvAFuAfVW3fcC11fAWYDwzX87Mp4HjwOXdLl+S1B+Rmd1NGPHjwB7gCRp7C0eAm4FnM3NlU7/nM3NVRNwGHM7MO6v2vcDBzLy7xby3A9sBhoeHN42Pj7dd19TUFENDQ129pk5NPvtix9MMr4DTL/WhmB7pdX0b15zfu5kx2Pe3G9ZXj/XV01zf2NjYkcwc6WY+dW67vRy4DHhfZj4UER+mOmw0i2jR1jKVMnMPjdBhZGQkR0dH2y5qYmKCTvrXcUMXt8TesXGaWycX793Oe13fietHezYvGOz72w3rq8f66ulVfXXOMZwETmbmQ9Xzu2kExemIWA1QPZ5p6r+uafq1wHM1li9J6oOugyEzvwZ8NSLeXDVdSeOw0gFgW9W2Dbi3Gj4AbI2I8yLiImAD8HC3y5ck9UfdYwbvA+6KiNcC/wr8Go2w2R8RNwLPANcBZObRiNhPIzymgZsy85Way5ck9VitYMjMx4BWJzeunKX/LmBXnWVKkvrLbz5LkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgoGgySpYDBIkgq1gyEilkXEv0TEfdXzCyLigYh4qnpc1dT3log4HhFPRsRVdZctSeq9Xuwx3Awc
a3q+EziUmRuAQ9VzIuJiYCtwCbAZuD0ilvVg+ZKkHqoVDBGxFrga+Mum5i3Avmp4H3BtU/t4Zr6cmU8Dx4HL6yxfktR7kZndTxxxN/AHwBuA92fmNRHxQmaubOrzfGauiojbgMOZeWfVvhc4mJl3t5jvdmA7wPDw8Kbx8fG2a5qammJoaKjr19SJyWdf7Hia4RVw+qU+FNMjva5v45rzezczBvv+dsP66rG+eprrGxsbO5KZI93MZ3m3BUTENcCZzDwSEaPtTNKirWUqZeYeYA/AyMhIjo62M/uGiYkJOulfxw077+94mh0bp7l1suvV3ne9ru/E9aM9mxcM9v3thvXVY3319Kq+Op8AbwHeGRHvAL4beGNE3AmcjojVmXkqIlYDZ6r+J4F1TdOvBZ6rsXxJUh90fY4hM2/JzLWZuZ7GSeV/yMxfAQ4A26pu24B7q+EDwNaIOC8iLgI2AA93XbkkqS/6cUxjN7A/Im4EngGuA8jMoxGxH3gCmAZuysxX+rB8SVINPQmGzJwAJqrhrwNXztJvF7CrF8uUJPWH33yWJBUMBklSYfFeN6lXhfUdXtJ7YvfVfapEUrvcY5AkFQwGSVLBYJAkFQwGSVLBYJAkFQwGSVLBYJAkFQwGSVLBYJAkFQwGSVLBYJAkFbxXUpNO7+sjSa9G7jFIkgoGgySpYDBIkgoGgySpYDBIkgpelSTNo5Or1XZsnGa0f6VIA2EwaMnz34dKveWhJElSoetgiIh1EfFgRByLiKMRcXPVfkFEPBART1WPq5qmuSUijkfEkxFxVS9egCSpt+rsMUwDOzLzR4ArgJsi4mJgJ3AoMzcAh6rnVOO2ApcAm4HbI2JZneIlSb3XdTBk5qnMfLQa/hZwDFgDbAH2Vd32AddWw1uA8cx8OTOfBo4Dl3e7fElSf/TkHENErAd+AngIGM7MU9AID+BNVbc1wFebJjtZtUmSFpHIzHoziBgC/hHYlZn3RMQLmbmyafzzmbkqIv4M+Hxm3lm17wU+k5mfajHP7cB2gOHh4U3j4+Nt1zM1NcXQ0FBXr2Xy2Re7mq4Twyvg9Et9X0zXFrq+jWvOn3N8q/e30/dtvmXM1Mn8h1fAmy7obP6DVOf3YxCsr57m+sbGxo5k5kg386l1uWpEvAb4FHBXZt5TNZ+OiNWZeSoiVgNnqvaTwLqmydcCz7Wab2buAfYAjIyM5OjoaNs1TUxM0En/ZjcM4O6qOzZOc+vk4r1KeKHrO3H96JzjW72/nb5v8y1jpk7mv2PjNL/c5fY3CHV+PwbB+urpVX11rkoKYC9wLDP/pGnUAWBbNbwNuLepfWtEnBcRFwEbgIe7Xb4kqT/q/Gn4FuBXgcmIeKxq+11gN7A/Im4EngGuA8jMoxGxH3iCxhVNN2XmKzWWL0nqg66DITP/CYhZRl85yzS7gF3dLlOS1H+L92C3zknz3d5ix8bpgZwLks5l3hJDklQwGCRJBYNBklQwGCRJBYNBklQwGCRJBYNBklTwewzSEuS/M1U/uccgSSoYDJKkgsEgSSoYDJKkgiefdc7p9MStdK5xj0GSVDAYJEkFg0GSVPAcg6SBm+08z2z/iMkv6A2WewySpILBIEkqGAySpILnGKRzwNlj+rMdw5+p02P6/f5uSDfz97xE917VweAXmSSpc6/qYJCWAv+A0WIz8GCIiM3Ah4FlwF9m5u5B1yBpbobV/F7N/xNjoMEQEcuAPwN+FjgJ/HNEHMjMJwZZh9RPfqguDq/mD+5+G/Qew+XA8cz8V4CIGAe2AAaDpAXVTpC0e/K+2/k3W8igiswc3MIifgnYnJm/UT3/VeAnM/O9M/ptB7ZXT98MPNnBYi4E/qMH5faL9dVjffVYXz1Lqb4fyMzv7WYmg95jiBZt35FMmbkH2NPVAiIeycyRbqYdBOurx/rqsb56zpX6Bv0Ft5PAuqbna4HnBlyDJGkOgw6GfwY2RMRFEfFaYCtwYMA1SJLmMNBDSZk5HRHvBf6e
xuWqd2Tm0R4vpqtDUANkffVYXz3WV885Ud9ATz5LkhY/b6InSSoYDJKkwpINhojYHBFPRsTxiNjZYnxExEeq8V+MiMsGWNu6iHgwIo5FxNGIuLlFn9GIeDEiHqt+fm9Q9VXLPxERk9WyH2kxfiHX35ub1stjEfHNiPjtGX0Guv4i4o6IOBMRjze1XRARD0TEU9XjqlmmnXNb7WN9fxwRX6rev09HxMpZpp1zW+hjfb8fEc82vYfvmGXahVp/n2yq7UREPDbLtINYfy0/U/q2DWbmkvuhceL6K8APAq8FvgBcPKPPO4CDNL47cQXw0ADrWw1cVg2/Afhyi/pGgfsWcB2eAC6cY/yCrb8W7/XXaHxZZ8HWH/A24DLg8aa2PwJ2VsM7gT+cpf45t9U+1vdzwPJq+A9b1dfOttDH+n4feH8b7/+CrL8Z428Ffm8B11/Lz5R+bYNLdY/h/26tkZn/BZy9tUazLcDHs+EwsDIiVg+iuMw8lZmPVsPfAo4Bawax7B5asPU3w5XAVzLz3xZg2f8nMz8HfGNG8xZgXzW8D7i2xaTtbKt9qS8zP5uZ09XTwzS+N7QgZll/7Viw9XdWRATwy8Aner3cds3xmdKXbXCpBsMa4KtNz0/ynR+87fTpu4hYD/wE8FCL0T8VEV+IiIMRcclgKyOBz0bEkWjcgmSmRbH+aHzXZbZfyIVcfwDDmXkKGr+4wJta9Fks6/HXaewBtjLfttBP760Odd0xy2GQxbD+3gqczsynZhk/0PU34zOlL9vgUg2Gdm6t0dbtN/opIoaATwG/nZnfnDH6URqHR34M+FPgbwZZG/CWzLwM+Hngpoh424zxi2H9vRZ4J/DXLUYv9Ppr12JYjx8EpoG7Zuky37bQLx8Ffgj4ceAUjcM1My34+gPexdx7CwNbf/N8psw6WYu2OdfhUg2Gdm6tsaC334iI19B4A+/KzHtmjs/Mb2bmVDX8GeA1EXHhoOrLzOeqxzPAp2nsbjZbDLcv+Xng0cw8PXPEQq+/yumzh9eqxzMt+iz0drgNuAa4PqsDzjO1sS30RWaezsxXMvN/gL+YZbkLvf6WA78IfHK2PoNaf7N8pvRlG1yqwdDOrTUOAO+urq65Anjx7C5Xv1XHJPcCxzLzT2bp831VPyLichrvxdcHVN/rI+INZ4dpnKR8fEa3BVt/TWb9S20h11+TA8C2angbcG+LPgt2G5ho/FOsDwDvzMz/nKVPO9tCv+prPmf1C7Msd6Fvo/N24EuZebLVyEGtvzk+U/qzDfbzTHo/f2hcNfNlGmfbP1i1vQd4TzUcNP4p0FeASWBkgLX9DI1dtS8Cj1U/75hR33uBozSuEDgM/PQA6/vBarlfqGpYVOuvWv7raHzQn9/UtmDrj0ZAnQL+m8ZfYDcC3wMcAp6qHi+o+n4/8Jm5ttUB1XecxrHls9vgn8+sb7ZtYUD1/VW1bX2RxgfV6sW0/qr2j53d5pr6LsT6m+0zpS/boLfEkCQVluqhJElSnxgMkqSCwSBJKhgMkqSCwSBJKhgMkqSCwSBJKvwvlUId3avGll4AAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_contracts[df_contracts.annual_salary < 20].annual_salary.hist(bins=25)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAATvklEQVR4nO3dfYxl9X3f8fenYBPCmgd3lRVmqZZKUImHxO1ONlQu7aztBmJbAquxtA41ILvaCOEqUUnLklYKkbXVNsrGFaJGXQvLUFKPaGwHZCAIo0wIEZTsEuJlIchr74ouILaOycLQlGTX3/5xz1Z3x/O0s3Pn3pnf+yWN5p7vPU/f+/CZM7975kyqCklSG/7OsHdAkrR8DH1JaoihL0kNMfQlqSGGviQ15PRh78B81q5dWxs2bBjIut955x3OOuusgax7VKz2Hu1v5VvtPQ6jv7Vr1/LYY489VlXXTL9v5EN/w4YN7N69eyDrnpycZHx8fCDrHhWrvUf7W/lWe4/D6i/J2pnqDu9IUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDRv4vcleiDdseHtq2D+74+NC2LWn0zXukn+TCJH+Y5KUk+5L8Sle/I8mrSZ7vvj7Wt8ztSfYneTnJ1X31jUn2dvfdmSSDaUuSNJOFHOkfBW6tqueSvA/Yk+Tx7r4vVtVv98+c5FJgC3AZ8AHg20kuqapjwN3AVuAZ4BHgGuDRpWlFkjSfeY/0q+r1qnquu/028BJwwRyLXAtMVNW7VXUA2A9sSnI+cHZVPV29f8x7H3DdqTYgSVq4nMw/Rk+yAXgSuBz4N8BNwFvAbnq/DbyZ5C7gmaq6v1vmHnpH8weBHVX10a5+FXBbVX1ihu1spfcbAevWrds4MTGxyPbmNjU1xZo1a5Z8vXtfPbLk61yoKy4454TpQfU4Kuxv5VvtPQ6rv82bN++pqrHp9QV/kJtkDfB14Fer6q0kdwNfAKr7vhP4LDDTOH3NUf/xYtUuYBfA2NhYDeqypIO65OlNw/wg9/rxE6a9bO3Kttr7g9Xf46j1t6BTNpO8h17g/25VfQOgqt6oqmNV9SPgy8CmbvZDwIV9i68HXuvq62eoS5KWyULO3glwD/BSVf1OX/38vtk+CbzQ3X4I2JLkjCQXARcDz1bV68DbSa7s1nkD8OAS9SFJWoCFDO98CPgMsDfJ813t14FPJ/kgvSGag8AvA1TVviQPAC/SO/Pnlu7MHYCbga8CZ9Ib5/fMHUlaRvOGflU9xczj8Y/Mscx2YPsM9d30PgSWJA2Bl2GQpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNWTe0E9yYZI/TPJSkn1JfqWrvz/J40m+230/r2+Z25PsT/Jykqv76huT7O3uuzNJBtOWJGkmpy9gnqPArVX1XJL3AXuSPA7cBDxRVTuSbAO2AbcluRTYAlwGfAD4dpJLquoYcDewFXgGeAS4Bnh0qZtSWzZse3go2z244+ND2a50KuY90q+q16vque7228BLwAXAtcC93Wz3Atd1t68FJqrq3ao6AOwHNiU5Hzi7qp6uqgLu61tGkrQM0svfBc6cbACeBC4HXqmqc/vue7OqzktyF/BMVd3f1e+hdzR/ENhRVR/t6lcBt1XVJ2bYzlZ6vxGwbt26jRMTE4tqbj5TU1OsWbNmyde799UjS77OhbrignNOmB5Uj6NiamqKA0eODWXb0x/rQVjtzx+s/h6H1d/mzZv3VNXY9PpChncASLIG+Drwq1X11hzD8TPdUXPUf7xYtQvYBTA2Nlbj4+ML3c2TMjk5ySDW
fdOQhhsADl4/fsL0oHocFZOTk+x86p2hbHv6Yz0Iq/35g9Xf46j1t6Czd5K8h17g/25VfaMrv9EN2dB9P9zVDwEX9i2+Hnitq6+foS5JWiYLOXsnwD3AS1X1O313PQTc2N2+EXiwr74lyRlJLgIuBp6tqteBt5Nc2a3zhr5lJEnLYCHDOx8CPgPsTfJ8V/t1YAfwQJLPAa8AnwKoqn1JHgBepHfmzy3dmTsANwNfBc6kN87vmTuStIzmDf2qeoqZx+MBPjLLMtuB7TPUd9P7EFiSNAT+Ra4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGrLgf6IiaXT4f4G1WB7pS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyLyhn+QrSQ4neaGvdkeSV5M83319rO++25PsT/Jykqv76huT7O3uuzNJlr4dSdJcFnKk/1XgmhnqX6yqD3ZfjwAkuRTYAlzWLfOlJKd1898NbAUu7r5mWqckaYDmDf2qehL44QLXdy0wUVXvVtUBYD+wKcn5wNlV9XRVFXAfcN0i91mStEjpZfA8MyUbgG9V1eXd9B3ATcBbwG7g1qp6M8ldwDNVdX833z3Ao8BBYEdVfbSrXwXcVlWfmGV7W+n9VsC6des2TkxMLL7DOUxNTbFmzZolX+/eV48s+ToX6ooLzjlhelA9joqpqSkOHDk2lG1Pf6wHYbbnb1ivsUH03MJrdBj9bd68eU9VjU2vn77I9d0NfAGo7vtO4LPATOP0NUd9RlW1C9gFMDY2VuPj44vczblNTk4yiHXftO3hJV/nQh28fvyE6UH1OComJyfZ+dQ7Q9n29Md6EGZ7/ob1GhtEzy28Rkepv0WdvVNVb1TVsar6EfBlYFN31yHgwr5Z1wOvdfX1M9QlSctoUaHfjdEf90ng+Jk9DwFbkpyR5CJ6H9g+W1WvA28nubI7a+cG4MFT2G9J0iLMO7yT5GvAOLA2ySHgN4DxJB+kN0RzEPhlgKral+QB4EXgKHBLVR0fcL2Z3plAZ9Ib5390CfuQJC3AvKFfVZ+eoXzPHPNvB7bPUN8NXH5SeydJWlL+Ra4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhoy7z9Gl6TjNmx7eMnXeesVR7lpnvUe3PHxJd9uqzzSl6SGrOoj/fmOShZyhCFJq4lH+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1JB5Qz/JV5IcTvJCX+39SR5P8t3u+3l9992eZH+Sl5Nc3VffmGRvd9+dSbL07UiS5rKQI/2vAtdMq20Dnqiqi4EnummSXApsAS7rlvlSktO6Ze4GtgIXd1/T1ylJGrB5Q7+qngR+OK18LXBvd/te4Lq++kRVvVtVB4D9wKYk5wNnV9XTVVXAfX3LSJKWSXoZPM9MyQbgW1V1eTf9V1V1bt/9b1bVeUnuAp6pqvu7+j3Ao8BBYEdVfbSrXwXcVlWfmGV7W+n9VsC6des2TkxMLKq5va8emfP+dWfCG3+9qFWPrCsuOOeE6ampKdasWTOkvRm8qakpDhw5NpRtT3+sB2G252++1/ZKspD34XI81oMyrPfg5s2b91TV2PT6Ul9lc6Zx+pqjPqOq2gXsAhgbG6vx8fFF7cx8V9C89Yqj7Ny7ui40evD68ROmJycnWezjtxJMTk6y86l3hrLt6Y/1IMz2/K2mq8Mu5H24HI/1oIzae3CxZ++80Q3Z0H0/3NUPARf2zbceeK2rr5+hLklaRosN/YeAG7vbNwIP9tW3JDkjyUX0PrB9tqpeB95OcmV3
1s4NfctIkpbJvGMbSb4GjANrkxwCfgPYATyQ5HPAK8CnAKpqX5IHgBeBo8AtVXV8wPVmemcCnUlvnP/RJe1EkjSveUO/qj49y10fmWX+7cD2Geq7gctPau8kSUvKv8iVpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNOX3YOyBJ89mw7eGhbfvgjo8PbduD4JG+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGnFPpJDibZm+T5JLu72vuTPJ7ku9338/rmvz3J/iQvJ7n6VHdeknRyluJIf3NVfbCqxrrpbcATVXUx8EQ3TZJLgS3AZcA1wJeSnLYE25ckLdAghneuBe7tbt8LXNdXn6iqd6vqALAf2DSA7UuSZpGqWvzCyQHgTaCA/1pVu5L8VVWd2zfPm1V1XpK7gGeq6v6ufg/waFX93gzr3QpsBVi3bt3GiYmJRe3f3lePzHn/ujPhjb9e1KpH1hUXnHPC9NTUFGvWrBnS3gze1NQUB44cG8q2pz/WgzDb8zffa3slGfX34ak+z8N6D27evHlP3wjM/3eqF1z7UFW9luSngMeT/MUc82aG2ow/capqF7ALYGxsrMbHxxe1czfNc5GmW684ys69q+uacwevHz9henJyksU+fivB5OQkO596Zyjbnv5YD8Jsz998r+2VZNTfh6f6PI/ae/CUhneq6rXu+2Hgm/SGa95Icj5A9/1wN/sh4MK+xdcDr53K9iVJJ2fRoZ/krCTvO34b+HngBeAh4MZuthuBB7vbDwFbkpyR5CLgYuDZxW5fknTyTuV3qnXAN5McX89/r6o/SPKnwANJPge8AnwKoKr2JXkAeBE4CtxSVcMZjJWkRi069Kvq+8DPzFD/S+AjsyyzHdi+2G1Kkk6Nf5ErSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGLPofo0ut27Dt4YFv49YrjnLTMmxH7fBIX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1Jasiyh36Sa5K8nGR/km3LvX1Jatmyhn6S04D/AvwCcCnw6SSXLuc+SFLLlvtIfxOwv6q+X1V/A0wA1y7zPkhSs1JVy7ex5BeBa6rqX3XTnwF+rqo+P22+rcDWbvIfAC8PaJfWAj8Y0LpHxWrv0f5WvtXe4zD6+wFAVV0z/Y7lvp5+Zqj92E+dqtoF7Br4ziS7q2ps0NsZptXeo/2tfKu9x1Hrb7mHdw4BF/ZNrwdeW+Z9kKRmLXfo/ylwcZKLkrwX2AI8tMz7IEnNWtbhnao6muTzwGPAacBXqmrfcu7DNAMfQhoBq71H+1v5VnuPI9Xfsn6QK0kaLv8iV5IaYuhLUkNWXegn+UqSw0le6KvdkeTVJM93Xx/r6u9Jcm+SvUleSnJ73zIbu/r+JHcmmel002U3U39d/V93l7fYl+S3+uq3dz28nOTqvvqK7y/JP0+yp+tjT5IP980/kv3ByT+H3X1/L8lUkl/rq41kj4t4jf50kqe7+t4kP9HVR7I/OOnX6WjlTFWtqi/gnwL/CHihr3YH8GszzPtLwER3+yeBg8CGbvpZ4B/T+9uCR4FfGHZvc/S3Gfg2cEY3/VPd90uBPwfOAC4Cvgector6+4fAB7rblwOv9i0zkv2dbI99938d+B/9r+NR7fEkn8PTge8AP9NN/91Rf40uoseRyplVd6RfVU8CP1zo7MBZSU4HzgT+BngryfnA2VX1dPWemfuA6waxvydrlv5uBnZU1bvdPIe7+rX0XmzvVtUBYD+wabX0V1V/VlXH/85j
H/ATSc4Y5f7gpJ9DklwHfJ9ej8drI9vjSfb388B3qurPu/pfVtWxUe4PTrrHkcqZVRf6c/h8ku90v5ad19V+D3gHeB14BfjtqvohcAG9PyQ77lBXG1WXAFcl+Z9J/ijJz3b1C4D/1Tff8T5WS3/9/gXwZ90bbqX1B7P0mOQs4DbgN6fNv9J6nO05vASoJI8leS7Jv+vqK60/mL3HkcqZ5b4Mw7DcDXyB3k/cLwA7gc/SuwDcMeADwHnAHyf5Ngu8XMQIOZ3e/l8J/CzwQJK/z+x9rIr+uqMjklwG/Cd6R42w8vqD2Z/D3wS+WFVT04Z7V1qPs/V3OvBPutr/AZ5Isgd4a4Z1jHJ/MHuPI5UzTYR+Vb1x/HaSLwPf6iZ/CfiDqvpb4HCSPwHGgD+md4mI40b9chGHgG90Ifhskh/Ru8jTbJe9OMTq6O9/J1kPfBO4oaq+1zf/SuoPZu/x54Bf7D4UPBf4UZL/S2+MfyX1ONdr9I+q6gcASR6hN1Z+PyurP5i9x5HKmSaGd7qxs+M+CRz/xP0V4MPpOYveT+i/qKrXgbeTXNl9mn4D8OCy7vTJ+X3gwwBJLgHeS+8qew8BW7px7ouAi4FnV0t/Sc4FHgZur6o/OT7zCuwPZumxqq6qqg1VtQH4z8B/rKq7VmCPv8/Mr9HHgJ9O8pPdmPc/A15cgf3B7D2OVs4M+pPi5f4CvkZv7Oxv6f3k/Rzw34C99M4SeAg4v5t3Db0zIvYBLwL/tm89Y/R+OHwPuIvur5eH/TVLf++ld2T0AvAc8OG++f9918PL9J0ZsBr6A/4DvbHS5/u+jp8xMZL9LeY57FvuDk48e2cke1zEa/Rfdu/BF4DfGvX+FvE6Hamc8TIMktSQJoZ3JEk9hr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyP8DDE9W+GtusxEAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_contracts.startY.hist(bins=10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0 0.987293\n",
+ "0.0 0.012707\n",
+ "Name: a_gender, dtype: float64"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.a_gender.value_counts(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9401.0"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.a_gender.sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9401.0"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts.a_gender.sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sum([True,False,False,True,True])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.026105873821609893"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "1-(df_contracts.a_gender.sum()/df_contracts.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.6105873821609893"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
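+ "# percentage of female apprentices (assumes a_gender: 1 = male, 0 = female); NOTE: rows with a missing a_gender, if any, are counted as female here\n",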
+ "(1-(df_contracts.a_gender.sum()/df_contracts.shape[0]))*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.3723194861701047"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
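+ "# percentage of female masters (assumes m_gender: 1 = male, 0 = female); NOTE: rows with a missing m_gender, if any, are counted as female here\n",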
+ "(1-(df_contracts.m_gender.sum()/df_contracts.shape[0]))*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "73.10924369747899"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# percentage of female apprentices (a_gender == 0) with a male master, restricted to contracts with startY < 1800\n",
+ "(df_contracts[(df_contracts.a_gender == 0) & (df_contracts.startY < 1800)].m_gender.sum()\\\n",
+ " /df_contracts[(df_contracts.a_gender == 0) & (df_contracts.startY < 1800)].shape[0])*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "98.10528582193993 %\n"
+ ]
+ }
+ ],
+ "source": [
+ "# percentage of male apprentices (a_gender == 1) with a male master, restricted to contracts with startY < 1800\n",
+ "print((df_contracts[(df_contracts.a_gender == 1) & (df_contracts.startY < 1800)].m_gender.sum()\\\n",
+ " /df_contracts[(df_contracts.a_gender == 1) & (df_contracts.startY < 1800)].shape[0])*100,\"%\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Looking at empirical distributions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAT2ElEQVR4nO3df4zk9X3f8eerh0OpiRM74NXlDveIdLbCj+QqVhTJbbTESbkYK+AqTg5RA7Wrsy0s2epVLaSR7MY6CbVx3FqJic4GgWWXCwqxQcEkITQrXAmK72zi44epD3OxlzvdySY1rGNdc/jdP+a7ZbLs7e7M7M3CfJ4PaTQzn+/38/1+5s3x2u985jvfSVUhSWrDP1jvAUiSxsfQl6SGGPqS1BBDX5IaYuhLUkNOW+8BrOSss86qLVu2DNzvBz/4Aa997WvXfkCvMtbhJdaixzr0THod9u/f/92qOntx+ys+9Lds2cK+ffsG7jc7O8vMzMzaD+hVxjq8xFr0WIeeSa9Dkr9eqt3pHUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1Jasgr/hu562XLDfcuu/zQTZePaSSStHY80pekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNWTH0k9ya5FiSx/ra/jDJo93tUJJHu/YtSX7Yt+wP+vpclORAkoNJPpkkp+QVSZJOajXn6d8G/B7w2YWGqvqNhcdJPg58v2/9p6tq2xLbuRnYCTwMfAnYDtw38IglSUNb8Ui/qh4EnltqWXe0/uvAHcttI8lG4HVV9VBVFb0/IFcOPFpJ0khG/UbuPweOVtU3+9rOTfI14Hngt6rqy8AmYK5vnbmubUlJdtJ7V8DU1BSzs7MDD2x+fn6ofgt2XXhi2eWjbHucRq3DJLEWPdahp9U6jBr6V/H3j/KPAG+qqu8luQj4YpLzgaXm7+tkG62qPcAegOnp6Rrmx4tH/dHj61a6DMPVw297nCb9x58HYS16rENPq3UYOvSTnAb8S+CihbaqOg4c7x7vT/I08GZ6R/ab+7pvBg4Pu29J0nBGOWXzl4BvVNX/n7ZJcnaSDd3jnwG2At+qqiPAC0ku6T4HuAa4e4R9S5KGsJpTNu8AHgLekmQuyXu7RTt4+Qe4vwB8PclfAX8EvL+qFj4E/gDwGeAg8DSeuSNJY7fi9E5VXXWS9uuWaLsLuOsk6+8DLhhwfJKkNeQ3ciWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGrOaH0W9NcizJY31tH03ybJJHu9vb+5bdmORgkqeSXNbXflGSA92yTybJ2r8cSdJyVnOkfxuwfYn2T1TVtu72JYAk5wE7gPO7Pp9KsqFb/2ZgJ7C1uy21TUnSKbRi6FfVg8Bzq9zeFcDeqjpeVc8AB4GLk2wEXldVD1VVAZ8FrhxyzJKkIZ02Qt8PJrkG2Afsqqq/ATYBD/etM9e1/V33eHH7kpLspPeugKmpKWZnZwce3Pz8/FD9Fuy68MSyy0fZ9jiNWodJYi16rENPq3UYNvRvBj4GVHf/ceA9wFLz9LVM+5Kqag+wB2B6erpmZmYGHuDs7CzD9Ftw3Q33Lrv80NXDb3ucRq3DJLEWPdahp9U6DHX2TlUdraoXq+pHwKeBi7tFc8A5fatuBg537ZuXaJckjdFQod/N0S94J7BwZs89wI4kpyc5l94Hto9U1RHghSSXdGftXAPcPcK4JUlDWHF6J8kdwAxwVpI54CPATJJt9KZoDgHvA6iqx5PcCTwBnACur6oXu019gN6ZQGcA93U3SdIYrRj6VXXVEs23LLP+bmD3Eu37gAsGGp0kaU35jVxJaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0
Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ1Z8ecStbQtN9x70mWHbrp8jCORpNVb8Ug/ya1JjiV5rK/tvyT5RpKvJ/lCkp/s2rck+WGSR7vbH/T1uSjJgSQHk3wySU7JK5IkndRqpnduA7YvarsfuKCqfg7438CNfcuerqpt3e39fe03AzuBrd1t8TYlSafYiqFfVQ8Czy1q+/OqOtE9fRjYvNw2kmwEXldVD1VVAZ8FrhxqxJKkoa3FnP57gD/se35ukq8BzwO/VVVfBjYBc33rzHVtS0qyk967AqamppidnR14UPPz80P1W7DrwhMrr3QSo+x3rY1ah0liLXqsQ0+rdRgp9JP8R+AE8Pmu6Qjwpqr6XpKLgC8mOR9Yav6+TrbdqtoD7AGYnp6umZmZgcc2OzvLMP0WXLfMB7UrOXT18Ptda6PWYZJYix7r0NNqHYYO/STXAu8A3tZN2VBVx4Hj3eP9SZ4G3kzvyL5/CmgzcHjYfUuShjPUefpJtgP/AfjVqvrbvvazk2zoHv8MvQ9sv1VVR4AXklzSnbVzDXD3yKOXJA1kxSP9JHcAM8BZSeaAj9A7W+d04P7uzMuHuzN1fgH47SQngBeB91fVwofAH6B3JtAZwH3dTZI0RiuGflVdtUTzLSdZ9y7grpMs2wdcMNDoJElrysswSFJDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWpI07+Ru9zv3ErSJPJIX5IaYuhLUkOant45VVaaNjp00+VjGokk/X0e6UtSQwx9SWqIoS9JDTH0Jakhhr4kNWTF0E9ya5JjSR7ra3tDkvuTfLO7f33fshuTHEzyVJLL+tovSnKgW/bJdL+oLkkan9Uc6d8GbF/UdgPwQFVtBR7onpPkPGAHcH7X51NJNnR9bgZ2Alu72+JtSpJOsRVDv6oeBJ5b1HwFcHv3+Hbgyr72vVV1vKqeAQ4CFyfZCLyuqh6qqgI+29dHkjQmw345a6qqjgBU1ZEkb+zaNwEP960317X9Xfd4cfuSkuyk966AqakpZmdnBx7g/Pz8iv12XXhi4O2uhWFez7BWU4dWWIse69DTah3W+hu5S83T1zLtS6qqPcAegOnp6ZqZmRl4ILOzs6zU77r1uuDagR8su3gtv7G7mjq0wlr0WIeeVusw7Nk7R7spG7r7Y137HHBO33qbgcNd++Yl2iVJYzRs6N8DXNs9vha4u699R5LTk5xL7wPbR7qpoBeSXNKdtXNNXx9J0pisOL2T5A5gBjgryRzwEeAm4M4k7wW+DbwLoKoeT3In8ARwAri+ql7sNvUBemcCnQHc190kSWO0YuhX1VUnWfS2k6y/G9i9RPs+4IKBRidJWlN+I1eSGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyNChn+QtSR7tuz2f5MNJPprk2b72t/f1uTHJwSRPJblsbV6CJGm1Vvxh9JOpqqeAbQBJNgDPAl8A/jXwiar6nf71k5wH7ADOB34a+Iskb66qF4cdgyRpMGs1vfM24Omq+utl1rkC2FtVx6vqGeAgcPEa7V+StAprFfo7gDv6nn8wydeT3Jrk9V3bJuA7fevMdW2SpDFJVY22geTHgMPA+VV1NMkU8F2ggI8BG6vqPUl+H3ioqj7X9bsF+FJV3bXENncCOwGmpqYu2rt378Djmp+f58wzz1x2nQPPfn/g7Y7DhZt+Ys22tZo6tMJa9FiHnkmvw6WXXrq/qqYXtw89p9/nV4CvVtVRgIV7gCSfBv6kezoHnNPXbzO9PxYvU1V7gD0A09PTNTMzM/CgZmdnWanfdTfcO/B2x+HQ1TNrtq3V1KEV1qLHOvS0Woe1mN65ir6pnSQb+5a9E3ise3wPsCPJ6UnOBbYCj6zB/iVJqzTS
kX6SfwT8MvC+vub/nGQbvemdQwvLqurxJHcCTwAngOs9c0eSxmuk0K+qvwV+alHbu5dZfzewe5R9SpKG5zdyJakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIaMFPpJDiU5kOTRJPu6tjckuT/JN7v71/etf2OSg0meSnLZqIOXJA1mLY70L62qbVU13T2/AXigqrYCD3TPSXIesAM4H9gOfCrJhjXYvyRplU7F9M4VwO3d49uBK/va91bV8ap6BjgIXHwK9i9JOolRQ7+AP0+yP8nOrm2qqo4AdPdv7No3Ad/p6zvXtUmSxuS0Efu/taoOJ3kjcH+SbyyzbpZoqyVX7P0B2QkwNTXF7OzswAObn59fsd+uC08MvN1xGOb1nsxq6tAKa9FjHXparcNIoV9Vh7v7Y0m+QG+65miSjVV1JMlG4Fi3+hxwTl/3zcDhk2x3D7AHYHp6umZmZgYe2+zsLCv1u+6Gewfe7jgcunpmzba1mjq0wlr0WIeeVusw9PROktcm+fGFx8C/AB4D7gGu7Va7Fri7e3wPsCPJ6UnOBbYCjwy7f0nS4EY50p8CvpBkYTv/var+NMlXgDuTvBf4NvAugKp6PMmdwBPACeD6qnpxpNFLkgYydOhX1beAn1+i/XvA207SZzewe9h9SpJG4zdyJakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDVk1Esr6xTYsszVPw/ddPkYRyJp0nikL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDRk69JOck+QvkzyZ5PEkH+raP5rk2SSPdre39/W5McnBJE8luWwtXoAkafVGuQzDCWBXVX01yY8D+5Pc3y37RFX9Tv/KSc4DdgDnAz8N/EWSN1fViyOMQZI0gKGP9KvqSFV9tXv8AvAksGmZLlcAe6vqeFU9AxwELh52/5KkwaWqRt9IsgV4ELgA+LfAdcDzwD567wb+JsnvAQ9X1ee6PrcA91XVHy2xvZ3AToCpqamL9u7dO/CY5ufnOfPMM5dd58Cz3x94u+vtwk0/sezyxa9p6gw4+sPV9Z10q/k30QLr0DPpdbj00kv3V9X04vaRr7KZ5EzgLuDDVfV8kpuBjwHV3X8ceA+QJbov+RenqvYAewCmp6drZmZm4HHNzs6yUr/rlrma5SvVoatnll2++DXtuvAEHz9w2qr6TrrV/JtogXXoabUOI529k+Q19AL/81X1xwBVdbSqXqyqHwGf5qUpnDngnL7um4HDo+xfkjSYUc7eCXAL8GRV/W5f+8a+1d4JPNY9vgfYkeT0JOcCW4FHht2/JGlwo0zvvBV4N3AgyaNd228CVyXZRm/q5hDwPoCqejzJncAT9M78ud4zdyRpvIYO/ar6nyw9T/+lZfrsBnYPu09J0mj8Rq4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyMiXYdDk2LLCZSkO3XT5mEYi6VSZ6NBfKcRejSbxNUkaH6d3JKkhhr4kNcTQl6SGTPScvsbHD4GlVweP9CWpIR7pa6L1vwPZdeGJl/2ymO9A1BqP9CWpIYa+JDXE0JekhjinLzXmwLPff9lnGwv8jGPyGfoai+VO6ZzUoGnxNb9SLfXfYuGD/db+W4w99JNsB/4bsAH4TFXdNO4xtGrU6/as13V/XqnfAfA6SHo1GmvoJ9kA/D7wy8Ac8JUk91TVE+Mch15ZXqnh+UodlzSKcR/pXwwcrKpvASTZC1wBGPoa2ijh/Ep99zKKUd75jDqu5fY9yrZXek2v1j/Q6zEFmKo6JRtecmfJrwHbq+rfdM/fDfzTqvrgovV2Aju7p28Bnhpid2cB3x1huJPCOrzEWvRYh55Jr8M/rqqzFzeO+0g/S7S97K9OVe0B9oy0o2Rf
VU2Pso1JYB1eYi16rENPq3UY93n6c8A5fc83A4fHPAZJata4Q/8rwNYk5yb5MWAHcM+YxyBJzRrr9E5VnUjyQeDP6J2yeWtVPX6KdjfS9NAEsQ4vsRY91qGnyTqM9YNcSdL68to7ktQQQ1+SGjJxoZ9ke5KnkhxMcsN6j2ecktya5FiSx/ra3pDk/iTf7O5fv55jHIck5yT5yyRPJnk8yYe69qZqkeQfJnkkyV91dfhPXXtTdeiXZEOSryX5k+55c7WYqNDvu8zDrwDnAVclOW99RzVWtwHbF7XdADxQVVuBB7rnk+4EsKuqfha4BLi++3fQWi2OA79YVT8PbAO2J7mE9urQ70PAk33Pm6vFRIU+fZd5qKr/Cyxc5qEJVfUg8Nyi5iuA27vHtwNXjnNM66GqjlTVV7vHL9D7n3wTjdWieua7p6/pbkVjdViQZDNwOfCZvubmajFpob8J+E7f87murWVTVXUEemEIvHGdxzNWSbYA/wT4XzRYi24641HgGHB/VTVZh85/Bf498KO+tuZqMWmhv6rLPKgNSc4E7gI+XFXPr/d41kNVvVhV2+h9+/3iJBes85DWRZJ3AMeqav96j2W9TVroe5mHlzuaZCNAd39sncczFkleQy/wP19Vf9w1N1kLgKr6P8Asvc98WqzDW4FfTXKI3rTvLyb5HA3WYtJC38s8vNw9wLXd42uBu9dxLGORJMAtwJNV9bt9i5qqRZKzk/xk9/gM4JeAb9BYHQCq6saq2lxVW+jlwv+oqn9Fg7WYuG/kJnk7vbm7hcs87F7fEY1PkjuAGXqXjD0KfAT4InAn8Cbg28C7qmrxh70TJck/A74MHOCl+dvfpDev30wtkvwcvQ8nN9A7wLuzqn47yU/RUB0WSzID/LuqekeLtZi40JckndykTe9IkpZh6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SG/D8WXZv35dC0cAAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_contracts[df_contracts.annual_salary < 50].annual_salary.hist(bins=40)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAD4CAYAAAAEhuazAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQaUlEQVR4nO3df6xf9V3H8efLMpHQbRTZGkLRommMQBXlBknmzG02Rx1LYEZMCRklznRZINkS/ljZP5uaJo1xU5cJsROyks01jduksaISshtcMmTtgpYfQ5pRsbRpM2GMLgYte/vHPa3fdd/b9nt7f33P5/lIvvme8znnfM/nndP7uud+zvmepqqQJLXlJxa7A5KkhWf4S1KDDH9JapDhL0kNMvwlqUHnLXYHzuSSSy6p1atXn5z/wQ9+wIUXXrh4HZonfa0L+lubdY2fvtY2rK69e/d+t6reNtM2Sz78V69ezZ49e07OT01NMTk5uXgdmid9rQv6W5t1jZ++1jasriT/cbptHPaRpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGLflv+EpzbfXm3SOtf2DrjfPUE2nxeOYvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSg84Y/kkuT/K1JM8meTrJR7r2i5M8kuT57n3FwDb3JNmf5LkkNwy0X5tkX7fsM0kyP2VJkk7nbM78jwN3V9UvAtcDdya5EtgMPFpVa4BHu3m6ZRuAq4D1wL1JlnWfdR+wCVjTvdbPYS2SpLN0xvCvqsNV9a1u+jXgWeAy4CZge7faduDmbvomYEdVvV5VLwD7geuSXAq8paq+UVUFPDiwjSRpAWU6h89y5WQ18BhwNfBiVV00sOyVqlqR5LPA41X1ha79fuBh4ACwtare3bW/E/hYVb1vyH42Mf0XAitXrrx2x44dJ5cdO3aM5cuXj1blGOhrXbD0atv30qsjrb/2srcObV9qdc2VvtYF/a1tWF3r1q3bW1UTM21z1v+Hb5LlwJeBj1bV908zXD9sQZ2m/ccbq7YB2wAmJiZqcnLy5LKpqSkG5/uir3XB0qvtjlH/D9/bJoe2L7W65kpf64L+1jabus7qbp8kb2I6+L9YVV/pmo90Qzl070e79oPA5QObrwIOde2rhrRLkhbY2dztE+B+4Nmq+vTAol3Axm56I/DQQPuGJOcnuYLpC7tPVNVh4LUk13efefvANpKkBXQ2wz7vAD4A7EvyZNf2cWArsDPJB4EXgVsAqurpJDuBZ5i+U+jOqnqj2+7DwOeBC5i+DvDw3JQhSRrFGcO/qr7O8PF6gHfNsM0WYMuQ9j1MXyyWJC0iv+ErSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhp0xvBP8kCSo0meGmj7ZJKXkjzZvd47sOyeJPuTPJfkhoH2a5Ps65Z9JknmvhxJ0tk4mzP/zwPrh7T/aVVd073+HiDJlcAG4Kpum3uTLOvWvw/YBKzpXsM+U5K0AM4Y/lX1GPDyWX7eTcCOqnq9ql4A9gPXJbkUeEtVfaOqCngQuHmWfZYknaPzzmHbu5LcDuwB7q6qV4DLgMcH1jnYtf1vN31q+1BJNjH9VwIrV65kamrq5LJjx479yHxf9LUuWHq13b32+Ejrz9T3pVbXXOlrXdDf2mZT12zD/z7gj4Dq3j8F/B4wbBy/TtM+VFVtA7YBTExM1OTk5MllU1NTDM73RV/rgqVX
2x2bd4+0/oHbJoe2L7W65kpf64L+1jabumZ1t09VHamqN6rqh8DngOu6RQeBywdWXQUc6tpXDWmXJC2CWYV/N4Z/wvuBE3cC7QI2JDk/yRVMX9h9oqoOA68lub67y+d24KFz6Lck6RyccdgnyZeASeCSJAeBTwCTSa5heujmAPAhgKp6OslO4BngOHBnVb3RfdSHmb5z6ALg4e4lSVoEZwz/qrp1SPP9p1l/C7BlSPse4OqReidJmhd+w1eSGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IadMb/yUtaSKs37x55mwNbb5yHnkj95pm/JDXI8JekBjnso7E3m6EiqXWe+UtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDzhj+SR5IcjTJUwNtFyd5JMnz3fuKgWX3JNmf5LkkNwy0X5tkX7fsM0ky9+VIks7G2Zz5fx5Yf0rbZuDRqloDPNrNk+RKYANwVbfNvUmWddvcB2wC1nSvUz9TkrRAzhj+VfUY8PIpzTcB27vp7cDNA+07qur1qnoB2A9cl+RS4C1V9Y2qKuDBgW0kSQss01l8hpWS1cDfVdXV3fz3quqigeWvVNWKJJ8FHq+qL3Tt9wMPAweArVX17q79ncDHqup9M+xvE9N/JbBy5cprd+zYcXLZsWPHWL58+eiVLnF9rQtGq23fS6/Oc29Gt/aytw5t7+sx62td0N/ahtW1bt26vVU1MdM2581xH4aN49dp2oeqqm3ANoCJiYmanJw8uWxqaorB+b7oa10wWm13bN49v52ZhQO3TQ5t7+sx62td0N/aZlPXbMP/SJJLq+pwN6RztGs/CFw+sN4q4FDXvmpIu7TkrZ7hF9Lda48P/WV1YOuN890l6ZzN9lbPXcDGbnoj8NBA+4Yk5ye5gukLu09U1WHgtSTXd3f53D6wjSRpgZ3xzD/Jl4BJ4JIkB4FPAFuBnUk+CLwI3AJQVU8n2Qk8AxwH7qyqN7qP+jDTdw5dwPR1gIfntBJJ0lk7Y/hX1a0zLHrXDOtvAbYMad8DXD1S7yRJ82KuL/hKzZvpGsFMvEagxeDjHSSpQYa/JDXI8JekBjnmr3m1evPuGe+Hl7R4PPOXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUHnLXYHJI1u9ebdI61/YOuN89QTjSvP/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUHnFP5JDiTZl+TJJHu6touTPJLk+e59xcD69yTZn+S5JDeca+clSbMzF2f+66rqmqqa6OY3A49W1Rrg0W6eJFcCG4CrgPXAvUmWzcH+JUkjmo9hn5uA7d30duDmgfYdVfV6Vb0A7Aeum4f9S5LOIFU1+42TF4BXgAL+sqq2JfleVV00sM4rVbUiyWeBx6vqC137/cDDVfU3Qz53E7AJYOXKldfu2LHj5LJjx46xfPnyWfd5qeprXfteepWVF8CR/17snsy9uapr7WVvHXmbfS+9Om/76Ou/RehvbcPqWrdu3d6BEZkfc65P9XxHVR1K8nbgkSTfPs26GdI29DdPVW0DtgFMTEzU5OTkyWVTU1MMzvdFX+u6Y/Nu7l57nE/t698DZOeqrgO3TY68zR2jPtVzhH309d8i9Le22dR1TsM+VXWoez8KfJXpYZwjSS4F6N6PdqsfBC4f2HwVcOhc9i9Jmp1Zh3+SC5O8+cQ08B7gKWAXsLFbbSPwUDe9C9iQ5PwkVwBrgCdmu39J0uydy9+sK4Gv
JjnxOX9dVf+Q5JvAziQfBF4EbgGoqqeT7ASeAY4Dd1bVG+fUe0nSrMw6/KvqO8AvD2n/L+BdM2yzBdgy231KkuaG3/CVpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDerfd+6lMbN6xEc1zPc+7l57nMn564qWCM/8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSg85b7A5ovKzevHuxuyBpDnjmL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIO/2aZx370htMvwlnbNRTyIObL1xnnqis2X4S1ryZvMXqr9gTm/Bwz/JeuDPgWXAX1XV1oXug6TTcziw/xY0/JMsA/4C+E3gIPDNJLuq6pmF7Mc48YdQ0nxY6DP/64D9VfUdgCQ7gJsAw19qyEKc1Azbx91rj3PHHO171GGlpXZdJFU1rzv4kZ0lvwOsr6rf7+Y/APxaVd11ynqbgE3d7C8Azw0svgT47gJ0d6H1tS7ob23WNX76Wtuwun62qt420wYLfeafIW0/9tunqrYB24Z+QLKnqibmumOLra91QX9rs67x09faZlPXQn/J6yBw+cD8KuDQAvdBkpq30OH/TWBNkiuS/CSwAdi1wH2QpOYt6LBPVR1Pchfwj0zf6vlAVT094scMHQ7qgb7WBf2tzbrGT19rG7muBb3gK0laGnywmyQ1yPCXpAaNTfgnWZ/kuST7k2xe7P7MpSQHkuxL8mSSPYvdn9lK8kCSo0meGmi7OMkjSZ7v3lcsZh9na4baPpnkpe64PZnkvYvZx9lIcnmSryV5NsnTST7StY/1cTtNXX04Zj+V5Ikk/9rV9gdd+0jHbCzG/LvHQvw7A4+FAG7ty2MhkhwAJqpqrL98kuQ3gGPAg1V1ddf2x8DLVbW1+6W9oqo+tpj9nI0ZavskcKyq/mQx+3YuklwKXFpV30ryZmAvcDNwB2N83E5T1+8y/scswIVVdSzJm4CvAx8BfpsRjtm4nPmffCxEVf0PcOKxEFpCquox4OVTmm8CtnfT25n+ARw7M9Q29qrqcFV9q5t+DXgWuIwxP26nqWvs1bRj3eybulcx4jEbl/C/DPjPgfmD9ORAdgr4pyR7u0db9MnKqjoM0z+QwNsXuT9z7a4k/9YNC43V0MipkqwGfgX4F3p03E6pC3pwzJIsS/IkcBR4pKpGPmbjEv5n9ViIMfaOqvpV4LeAO7shBi199wE/D1wDHAY+tai9OQdJlgNfBj5aVd9f7P7MlSF19eKYVdUbVXUN009JuC7J1aN+xriEf68fC1FVh7r3o8BXmR7m6osj3fjriXHYo4vcnzlTVUe6H8IfAp9jTI9bN278ZeCLVfWVrnnsj9uwuvpyzE6oqu8BU8B6Rjxm4xL+vX0sRJILuwtSJLkQeA/w1Om3Giu7gI3d9EbgoUXsy5w68YPWeT9jeNy6i4f3A89W1acHFo31cZuprp4cs7cluaibvgB4N/BtRjxmY3G3D0B3S9af8f+PhdiyuD2aG0l+jumzfZh+3MZfj2ttSb4ETDL9eNkjwCeAvwV2Aj8DvAjcUlVjd+F0htommR4+KOAA8KETY67jIsmvA/8M7AN+2DV/nOnx8bE9bqep61bG/5j9EtMXdJcxfQK/s6r+MMlPM8IxG5vwlyTNnXEZ9pEkzSHDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXo/wCyR1tTDJOpXgAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "df_contracts[df_contracts.a_age < 30].a_age.hist(bins=25)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Two very important distributions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Normal\n",
+ "\n",
+ "Also known as the Gaussian distribution, this is a bell-shaped distribution with most of its mass concentrated around the mean and tails that decay exponentially on both sides. It is fully characterized by its mean (center of mass) and standard deviation (spread).\n",
+ "\n",
+ "https://en.wikipedia.org/wiki/Normal_distribution"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAWpElEQVR4nO3df5Dc9X3f8efb0hkMsoIVJHxIXE5MMRH4hvjmQh3T6TiGJmriBCcTXKWNq+mo1R8oNm4zDsKdTtPpqMNMGY8zLqTRkNRKYhurDh6UlIEYbNzpDDZgx81anF0roOAzFw6wXZmkkJN494/9nvzV6iQt0n73s3v7fMwwt/vZ7/fuvaO9F9/7fD8/IjORJPXf60oXIEmjygCWpEIMYEkqxACWpEIMYEkqZHXpAs7F1q1b84EHHihdhiSdSSzXONRXwC+88ELpEiTprA11AEvSMDOAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSChnq5SilQbK4uEir1Tr+fGpqirGxsYIVadAZwFKPtFotbr7zAGvHJzkyf5i7dsH09HTpsjTADGCph9aOT7Ju4srSZWhIGMBSA149dpTZ2dkT2uySUKdGAzgiDgM/AI4BRzNzJiLWAZ8GJoHDwHsz83vV8bcBO6rjP5CZDzZZn9SUlxbmuOP+l1k/uwhgl4SW1Y8r4J/OzPreQbuBhzPz9ojYXT2/NSKuArYBVwOXAg9FxFsy81gfapR6bs2GCbsjdFolhqHdCOyrHu8D3lNrvyczX8nMp4FDwLX9L0+S+qPpK+AE/iwiEvjdzNwLXJKZ8wCZOR8RG6pjNwJfqp07V7WdICJ2AjsBJiYmmqxdakznkDWwj3gUNR3A12Xms1XIfi4ivnGaY5fbtjlPamiH+F6AmZmZk16XhkF9yBrYRzyqGg3gzHy2+roQEZ+l3aXwXESMV1e/48BCdfgccFnt9E3As03WJ5XkkDU11gccERdGxBuXHgM/A3wdOABsrw7bDtxXPT4AbIuI8yJiM3AF8FhT9UlSaU1eAV8CfDYiln7OJzPzgYh4HNgfETuAZ4CbADLzYETsB54EjgK7HAEhaSVrLIAz8yngmmXaXwSuP8U5e4A9TdUkSYPEmXDSAOicOeeIiNFgAEsDoD5zzhERo8MAlgaEM+dGjwEsnaXOyRSzs7OQDk1X9wxg6Sx1TqaYbz3KRZefdN9ZOiUDWDoH9ckUR+YPly1GQ8c94SSpEANYkgoxgCWpEANYkgoxgCWpEEdBSH3QOdXYMcMCA1jqi85NOh0zLDCApb6pTzV2zLDAPmBJKsYAlqRC7IKQuuTiO+o1A1jqkovvqNcMYOk1cPEd9ZJ9wJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiOOApQHXOQMPYGpqirGxsUIVqVcMYGnAdc7AOzJ/mLt2wfT0dNnCdM4MYGkI1GfgaeWwD1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQF2SXBsyrx44yOzt7/Pns7CxkFqxITTGApQHz0sIcd9z/MutnFwGYbz3KRZdfU7gqNcEAlgbQmg0Tx7cgOjJ/uGwxaox9wJJUiAEsSYUYwJJUSON9wBGxCngC+E5mvjsi1gGfBiaBw8B7M/N71bG3ATuAY8AHMvPBpuuTliwuLtJqtU5om5qaYmxsrFBFWun6cRPuFmAWWFs93w08nJm3R8Tu6vmtEXEVsA24GrgUeCgi3pKZx/pQo0Sr1eLmOw+wdnwSaN/8umsXTE9Ply1MK1ajXRARsQn4eeDuWvONwL7q8T7gPbX2ezLzlcx8GjgEXNtkfVKnteOTrJu4knUTVx4PYqkpTV8BfxT4TeCNtbZLMnMeIDPnI2JD1b4R+FLtuLmq7QQRsRPY
CTAxMdFAyVKbEyLUtMYCOCLeDSxk5lci4p3dnLJM20mf9szcC+wFmJmZ8bdBjXFChJrW5BXwdcAvRsTPAecDayPij4DnImK8uvodBxaq4+eAy2rnbwKebbA+6YycEKEmNdYHnJm3ZeamzJykfXPt85n5a8ABYHt12HbgvurxAWBbRJwXEZuBK4DHmqpPkkorMRX5dmB/ROwAngFuAsjMgxGxH3gSOArscgSEpJWsLwGcmY8Aj1SPXwSuP8Vxe4A9/ahJkkpzJpwkFWIAS1IhBrAkFWIAS1IhBrAkFWIAS1IhBrAkFWIAS1IhBrAkFWIAS1IhBrAkFVJiMR5J56BzoXhw77phZQBLQ6ZzoXj3rhteBrA0hOoLxWt42QcsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYUYwJJUyOrSBUj9tLi4SKvVOqFtamqKsbGxQhWdu1ePHWV2dvb482F/P6PEANZIabVa3HznAdaOTwLw/e/8JbfcMMuWLVvaIZZZtsCz8NLCHHfc/zLrZxc5Mn+Yu3bB9PR06bLUBQNYI2ft+CTrJq4E4Mj8Ye64v8X62UXmW49y0eXXFK7u7KzZMHH8PWl42AeskbcUXhdePF66FI0YA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCjGAJakQA1iSCmksgCPi/Ih4LCL+d0QcjIj/ULWvi4jPRcS3qq9vqp1zW0QciohvRsTPNlWbJA2CJq+AXwHelZnXAD8BbI2ItwO7gYcz8wrg4eo5EXEVsA24GtgK3BURqxqsT5KKaiyAs+2l6ulY9V8CNwL7qvZ9wHuqxzcC92TmK5n5NHAIuLap+iSptEb7gCNiVUR8DVgAPpeZXwYuycx5gOrrhurwjcC3a6fPVW2d33NnRDwREU88//zzTZYvSY1qdDnKzDwG/EREXAR8NiLeeprDY7lvscz33AvsBZiZmRm+xVvVV50LsA/rmr9amfqyHnBmfj8iHqHdt/tcRIxn5nxEjNO+Oob2Fe9ltdM2Ac/2oz6tXJ0LsA/zmr9aeZocBbG+uvIlIt4A3AB8AzgAbK8O2w7cVz0+AGyLiPMiYjNwBfBYU/VpdCwtwO6avxo0TV4BjwP7qpEMrwP2Z+afRsSjwP6I2AE8A9wEkJkHI2I/8CRwFNhVdWFI0orUWABn5l8Ab1um/UXg+lOcswfY01RNkjRInAknSYV0FcARcV03bZKk7nV7BfyxLtskSV06bR9wRPwU8A5gfUT8m9pLawGnCUsD5tVjR9tjnWumpqYYGxsrVJFO50w34V4PrKmOe2Ot/QjwK00VJensvLQwxx33v8z62UUAjswf5q5dMD09XbgyLee0AZyZXwS+GBEfz8y/6lNNks7Bmg0TrJu4snQZ6kK3w9DOi4i9wGT9nMx8VxNFSdIo6DaA/zvwX4G7ASdHSFIPdBvARzPzdxqtRJJGTLfD0P4kIm6OiPFqR4t1EbGu0cokaYXr9gp4afGcD9XaEri8t+VI6iWHpQ22rgI4Mzc3XYik3nNY2mDrKoAj4p8v156Zf9DbciT1msPSBle3XRA/WXt8Pu3VzL4KGMCSdJa67YJ4f/15RPwI8IeNVCRJI+Jsl6P8W9o7VkiSzlK3fcB/wg83yFwFbAH2N1WUJI2CbvuA76g9Pgr8VWbONVCPJI2MbvuAvxgRl/DDm3Hfaq4k6ey5Db2GSbddEO8F/jPwCBDAxyLiQ5n5mQZrk14zt6HXMOm2C+LfAj+ZmQvQ3nIeeAgwgDVwlrahh/bEA2lQdTsK4nVL4Vt58TWcK0laRrdXwA9ExIPAp6rn/wS4v5mSJGk0nGlPuL8HXJKZH4qIXwb+Ae0+4EeBT/ShPkk95OI8g+VMV8AfBT4MkJn3AvcCRMRM9dov
NFibpB5zcZ7BcqYAnszMv+hszMwnImKymZIkNcnFeQbHmW6knX+a197Qy0IkadScKYAfj4h/1dkYETuArzRTkiSNhjN1QXwQ+GxE/DN+GLgzwOuBX2qwLkla8U4bwJn5HPCOiPhp4K1V8//IzM83XpkkrXDdrgXxBeALDdciSSPF2WySVIgBLEmFGMCSVIgBLEmFGMCSVIgBLEmFGMCSVIgBLEmFGMCSVIgBLEmFGMCSVEi3e8JJWoE6tyhye6L+MoClEVbfosjtifrPAJZGnFsUlWMfsCQVYgBLUiEGsCQVYgBLUiEGsCQVYgBLUiGNBXBEXBYRX4iI2Yg4GBG3VO3rIuJzEfGt6uubaufcFhGHIuKbEfGzTdUmSYOgySvgo8BvZOYW4O3Aroi4CtgNPJyZVwAPV8+pXtsGXA1sBe6KiFUN1idJRTU2ESMz54H56vEPImIW2AjcCLyzOmwf8Ahwa9V+T2a+AjwdEYeAa4FHm6pRK8Pi4iKtVgugPa02s3BFUnf6MhMuIiaBtwFfBi6pwpnMnI+IDdVhG4Ev1U6bq9o6v9dOYCfAxMREg1VrWLRaLW6+8wBrxyeZbz3KRZdfU7okqSuN34SLiDXAHwMfzMwjpzt0mbaTLmUyc29mzmTmzPr163tVpobc2vFJ1k1cyYUXj5cuRepao1fAETFGO3w/kZn3Vs3PRcR4dfU7DixU7XPAZbXTNwHPNlmfpB/qXBkNXB2taY0FcEQE8HvAbGZ+pPbSAWA7cHv19b5a+ycj4iPApcAVwGNN1SfpRPWV0QBXR+uDJq+ArwPeB7Qi4mtV24dpB+/+iNgBPAPcBJCZByNiP/Ak7REUuzLzWIP1Sergymj91eQoiP/F8v26ANef4pw9wJ6mapKkQeJMOEkqxACWpEIMYEkqxACWpEIMYEkqxACWpEIMYEkqxACWpEIMYEkqxACWpEL6sh6w1Ev1BdjBRdg1vAxgDZ36AuyAi7BraBnAGkpLC7BDe9lEaRjZByxJhRjAklSIXRAaeN5000plAGvgedNNK5UBrKHgTTetRPYBS1IhBrAkFWIAS1IhBrAkFWIAS1IhBrAkFWIAS1IhBrAkFWIAS1IhzoSTtKxXjx1tr7tRMzU1xdjYWKGKVh4DWNKyXlqY4477X2b97CLQngJ+1y6Ynp4uXNnKYQBLOqU1GyaOr8Gh3rMPWJIK8QpYUlfsE+49A1hSV+wT7j0DWFLX7BPuLfuAJakQA1iSCrELQgPHTTg1KgxgDRw34dSoMIA1kNyEU6PAPmBJKsQAlqRCDGBJKsQAlqRCDGBJKsQAlqRCDGBJKsQAlqRCDGBJKsSZcBoI9fUfXPtBo8IA1kCor//g2g8aFXZBaGAsrf9w4cXjpUuR+qKxK+CI+H3g3cBCZr61alsHfBqYBA4D783M71Wv3QbsAI4BH8jMB5uqTdK5c4+4c9dkF8THgf8C/EGtbTfwcGbeHhG7q+e3RsRVwDbgauBS4KGIeEtmHmuwPknnwD3izl1jAZyZ/zMiJjuabwTeWT3eBzwC3Fq135OZrwBPR8Qh4Frg0abqk3Tu3CPu3PS7D/iSzJwHqL5uqNo3At+uHTdXtZ0kInZGxBMR8cTzzz/faLGS1KRBuQkXy7QtOw4pM/dm5kxmzqxfv77hsiSpOf0O4OciYhyg+rpQtc8Bl9WO2wQ82+faJKmv+h3AB4Dt1ePtwH219m0RcV5EbAauAB7rc22S1FdNDkP7FO0bbhdHxBzw74Hbgf0RsQN4BrgJIDMPRsR+4EngKLDLERCSVromR0H86ileuv4Ux+8B9jRVjwaLW89LTkVWIW49LxnAKsit5zXqBmUYmiSNHK+A1Rf2+UonM4DVF/b5SiczgNU39vlKJ7IPWJIKMYAlqRADWJIKMYAlqRADWJIKMYAlqRADWJIKcRywpJ7o3CXZHZLPzACW1BP1XZLdIbk7BrCknnGX5NfGPmBJKsQAlqRCDGBJKsQAlqRCvAknqec6
h6SBw9KWYwBL6rn6kDTAYWmnYABLaoRD0s7MAFYj3ANOOjMDWI1wDzjpzAxgNcY94HQqnX8hwWjepDOAJfVd519Io3qTzgCWVET9L6RR5UQMSSrEK2D1TL1fz1EPquucmOHno80AVs/U+/Uc9aC6zokZfj7aDGD11FK/nqMe1Kk+McPPR5t9wJJUiAEsSYUYwJJUiAEsSYUYwJJUiAEsSYU4DE1nzSUnpXNjAOusueSkeqVzplx9ZbSVvHKaAaxz4pKT6oX6TLnOldFW8sppBrC6ZpeDmnS6LYxW6sppBrC6ZpeD1FsGsI7rpq/NLgepdwxgHbeS+9o0PEZp6UoDWCdYqX1tGh6jtHSlASxp4Jxu6crOK2QY3mFpBrCkodJ5hTzMXWUGsE5plPriNFxON2RtmBjAI6ZzpMPi4iIRwerVq08K2FHqi9Pwql8o1D/PSwa5e8IAHjHLjeVdveZNrN/848sGrNvIaNDVLxTqn2cY/O6JgQvgiNgK/DawCrg7M28vXNJQW2722to3/9gJoTq2doP7uGmoLV0o1D/Pyxm0dSUGKoAjYhVwJ/CPgDng8Yg4kJlP9upnDNo/wJJ6XWf6M+q1vAdnr2mULXcf42MP/x/WXroZgO9/5y+55YZZtmzZcvyYpd+lfmTFQAUwcC1wKDOfAoiIe4AbgZ4FcKvV4n3/7mNc8KNvBuBvX/xrPvxPbzjhH6CE2dlZ/tMnH+KCH30zLz51kFVveCMXjU8sW2P92OVe7/y+nV5aeIbvXngBAH/zwjyrX36Z7154wQmPO197rc9H7dxBqcP3f+Jrzz35OL/1xEtcNP51AF586iA/8mNXHf9d+H/fXeC3Pv7U8dfrv0vL/Z794X98f0+7MyIH6K52RPwKsDUz/2X1/H3A38/MX68dsxPYWT29EvhmD370xcALPfg+w2YU37fveXQM0vt+ITO3djYO2hVwLNN2wv8hMnMvsLenPzTiicyc6eX3HAaj+L59z6NjGN73oG1JNAdcVnu+CXi2UC2S1KhBC+DHgSsiYnNEvB7YBhwoXJMkNWKguiAy82hE/DrwIO1haL+fmQf78KN72qUxREbxffueR8fAv++BugknSaNk0LogJGlkGMCSVMjIB3BEbI2Ib0bEoYjYXbqepkXEZRHxhYiYjYiDEXFL6Zr6JSJWRcSfR8Sflq6lXyLiooj4TER8o/o3/6nSNTUtIv519dn+ekR8KiLOL13TqYx0ANemPv9j4CrgVyPiqtOfNfSOAr+RmVuAtwO7RuA9L7kFOHlq4Mr228ADmfnjwDWs8PcfERuBDwAzmflW2jfzt5Wt6tRGOoCpTX3OzL8DlqY+r1iZOZ+ZX60e/4D2L+TGslU1LyI2AT8P3F26ln6JiLXAPwR+DyAz/y4zv1+0qP5YDbwhIlYDFzDAcwlGPYA3At+uPZ9jBMJoSURMAm8Dvly4lH74KPCbwKuF6+iny4Hngf9Wdb3cHREXli6qSZn5HeAO4BlgHvi/mflnZas6tVEP4DNOfV6pImIN8MfABzPzSOl6mhQR7wYWMvMrpWvps9XANPA7mfk24G+AFX2fIyLeRPuv2M3ApcCFEfFrZas6tVEP4JGc+hwRY7TD9xOZeW/pevrgOuAXI+Iw7W6md0XEH5UtqS/mgLnMXPoL5zO0A3kluwF4OjOfz8xF4F7gHYVrOqVRD+CRm/ocEUG7T3A2Mz9Sup5+yMzbMnNTZk7S/jf+fGYO7FVRr2TmXwPfjoil1cmvp4dLuw6oZ4C3R8QF1Wf9egb4xuNATUXut4JTn0u6Dngf0IqIr1VtH87M+8uVpAa9H/hEdYHxFPAvCtfTqMz8ckR8Bvgq7RE/f84AT0l2KrIkFTLqXRCSVIwBLEmFGMCSVIgBLEmFGMCSVIgBLEmFGMCSVMj/B/qjFlrdIbQIAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "s1 = np.random.normal(5, 1, 10000)\n",
+ "sns.displot(s1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAD4CAYAAADSIzzWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAMCElEQVR4nO3db2hd9R3H8c+3SWcTi2xWURaNV7lzbZkMtzDchFHbirnZ2B4JDreEgRSqi10djK1cEOHioyGTIANxGymTDXQ+mCPpprjHskQt1aaTg4u1md1qZP5ZUm3a3x7kz24al9y09+ZzkrxfUG567+k533N6z7snJwmNlJIAACtvg3sAAFivCDAAmBBgADAhwABgQoABwKR5OQtfccUVqVAoNGgUAFibhoeH30kpXXn+88sKcKFQ0NDQUP2mAoB1ICLe/KTnuQUBACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmCzr/4QD8qyvr09ZljVk3WNjY5Kktra2hqy/WCyqt7e3IetGfhFgrBlZlumVV0d0tvXyuq+7aeI9SdLJj+p/yjRNvFv3dWJ1IMBYU862Xq7JrV11X2/LsQFJaui6sf5wDxgATAgwAJgQYAAwIcAAYEKAAcCEAAOACQEGABMCDAAmBBgATAgwAJgQYAAwIcAAYEKAAcCEAAOACQEGABMCDAAmBBgATAgwAJgQYAAwIcAAYEKAAcCEAAOACQEGABMCDAAmBBgATAgwAJgQYAAwIcAAYEKAAcCEAAOACQEGABMCDAAmBBgATAgwAJgQYAAwIcAAYEKAAcCEAAOACQEGABMCDAAmBBgATAjwKtXX16e+vj73GEDdraf3drN7AFyYLMvcIwANsZ7e21wBA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADBZkQCPj4/r/vvv1/j4+Eps7qLNzptl2bzHWuavZV+rl8myTKVSSXv27NHw8LBKpZLuvvtu3XbbbXr22WdVKpV01113aceOHXO/du3apcOHD+vw4cPKsqyeuw7kwsTEhHbv3q0dO3boscceU2dn57xzoLu7W1mW6d5779XevXv1wgsvaOfOneru7tbevXvnzr/x8XHdc8896urqUpZl8869WrvUyH6tSID7+/t15MgRHTx4cCU2d9Fm561UKvMea5m/ln2tXqZSqWhyclKvv/66HnzwQU1OTmpsbEwpJT3yyCOanJzUyZMn5/35s2fPzn1cqVQufEeBnDp+/LimpqYkSU899ZROnz694PVKpaKjR49qZGREDz/8sM6dO6fjx49rZGRk7vzr7+9XlmWamJhQpVKZd+7V2qVG9qvhAR4fH9ehQ4eUUtKhQ4dyfxVcPe/o6Oi8x6Xmr2Vfq5cZGBjQ6Ojo3GsffvjhvGVTSkvOOzo6ylUw1pSJiQl99NFHSy5Xfe7MxnrW4OCgsizTwMDAvOUHBweVUtLg4ODcx4ud143uV3Nd1/YJ+vv7de7cOUnTV24HDx7U/v37G73ZC1Y97/mWmr+Wfa1e5syZM3WZ+b777tPWrVvrsq7VLMsybfh46X+08mbD6feVZR9o37597lFyoR4XFGfOnFGlUlkQ5tlzrvrcW+y8bnS/lrwCjog9ETEUEUOnTp1a9gaef/75uYMwNTWl5557bvlTrqDqec+31Py17Oti679QtVwtAKtFLZ/51bKO6ivkT3p9djuLndeN7teSV8AppcclPS5JHR0dyz4yu3fv1sDAgKamptTc3Kzb
b7/9AsZcOdXznm+p+WvZ18XWf6EKhYIeffTRuq1vtdq3b5+G3/ine4xlO7fpMhVvuIq/wxl33HHHRV9URISuu+66/xvhiJA0HeLFzutG96vh94B7enq0YcP0ZpqamtTd3d3oTV6U6nnPt9T8texr9TIbN26sw8RSuVyuy3qAPGhvb7/odWzcuFHlclnNzc0Lnp99nH1tsfO60f1qeIC3bNmizs5ORYQ6Ozu1ZcuWRm/yolTPWygU5j0uNX8t+1q9TFdXlwqFwtxrmzdvnrfs7L/SiykUCioWi7XvIJBzra2tuuSSS5ZcrvrcOT+0pVJJxWJRXV1d85YvlUqKCJVKpbmPFzuvG92vFfk2tJ6eHt100025v/qdNTtvuVye91jL/LXsa/Uy5XJZLS0tuvHGG/XQQw+ppaVFbW1tigg98MADamlp0dVXXz3vzzc1Nc19zNUv1qL29va5qN55553atGnTgtfL5bK2b9+ubdu26cCBA9qwYYPa29u1bdu2ufOvp6dHxWJRra2tKpfL8869WrvUyH7Fcm54d3R0pKGhoboPgeWb/Yo59w3/Z/Ye8OTWrqUXXqaWY9PfztSodX+Ze8Bz1uJ7OyKGU0od5z/PjyIDgAkBBgATAgwAJgQYAEwIMACYEGAAMCHAAGBCgAHAhAADgAkBBgATAgwAJgQYAEwIMACYEGAAMCHAAGBCgAHAhAADgAkBBgATAgwAJgQYAEwIMACYEGAAMCHAAGBCgAHAhAADgAkBBgATAgwAJgQYAEwIMACYEGAAMCHAAGBCgAHAhAADgAkBBgATAgwAJgQYAEwIMACYEGAAMGl2D4ALUywW3SMADbGe3tsEeJXq7e11jwA0xHp6b3MLAgBMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACYEGABMCDAAmBBgADAhwABgQoABwIQAA4AJAQYAEwIMACbN7gGAemqaeFctxwYasN5xSWrQut+VdFXd14v8I8BYM4rFYsPWPTY2JUlqa2tEKK9q6OzILwKMNaO3t9c9ArAs3AMGABMCDAAmBBgATAgwAJgQYAAwIcAAYEKAAcCEAAOACQEGABMCDAAmBBgATAgwAJgQYAAwIcAAYEKAAcCEAAOACQEGABMCDAAmBBgATAgwAJhESqn2hSNOSXqzjtu/QtI7dVzfWsAxWYhjshDHZKE8H5PrUkpXnv/ksgJcbxExlFLqsA2QQxyThTgmC3FMFlqNx4RbEABgQoABwMQd4MfN288jjslCHJOFOCYLrbpjYr0HDADrmfsKGADWLQIMACaWAEdEZ0T8LSKyiPiJY4Y8iYhrI+IvETESEa9FxD73THkREU0R8XJE/NE9Sx5ExKcj4umIODbzfvmqeya3iNg/c968GhG/jYhN7plqteIBjogmSY9JKknaLuk7EbF9pefImSlJP0opbZN0i6T7OCZz9kkacQ+RI49KOpRS2irpi1rnxyYi2iTdL6kjpfQFSU2S7vJOVTvHFfBXJGUppTdSSh9L+p2kbxvmyI2U0tsppZdmPv5A0ydVm3cqv4i4RtI3JD3hniUPIuIySV+X9EtJSil9nFL6t3WofGiW1BIRzZJaJf3DPE/NHAFuk/RW1e9PiNjMiYiCpJslvWgeJQ9+LunHks6Z58iLGySdkvTrmdsyT0TEpe6hnFJKY5J+Jum4pLclvZdS+rN3qto5Ahyf8BzfCycpIjZL+r2kH6aU3nfP4xQR35T0r5TSsHuWHGmW9CVJv0gp3SzpP5LW9ddQIuIzmv4M+npJn5V0aUR81ztV7RwBPiHp2qrfX6NV9ClDo0TERk3H98mU0jPueXLgVknfiohRTd+m2hkRv/GOZHdC0omU0uxnR09r
Osjr2W5Jf08pnUopnZH0jKSvmWeqmSPAf5X0uYi4PiI+pekb5n8wzJEbERGavq83klJ6xD1PHqSUfppSuialVND0e+SFlNKqubJphJTSSUlvRcTnZ57aJemocaQ8OC7plohonTmPdmkVfWGyeaU3mFKaiogfSPqTpr9i+auU0msrPUfO3Crpe5KORMQrM88dSCkN+EZCTvVKenLm4uUNSd83z2OVUnoxIp6W9JKmv5voZa2iH0nmR5EBwISfhAMAEwIMACYEGABMCDAAmBBgADAhwABgQoABwOS/2yHVGtk42qoAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# for boxplots see https://en.wikipedia.org/wiki/Interquartile_range (or ask!)\n",
+ "sns.boxplot(x=s1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Heavy-tailed\n",
+ "Heavy-tailed distributions have a small but non-negligible number of observations with very high values. Several probability distributions follow this pattern: https://en.wikipedia.org/wiki/Heavy-tailed_distribution#Common_heavy-tailed_distributions.\n",
+ "\n",
+ "We pick the lognormal here: https://en.wikipedia.org/wiki/Log-normal_distribution"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "s2 = np.random.lognormal(5, 1, 10000)\n",
+ "sns.displot(s2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWIAAAD4CAYAAADW1uzrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAPHklEQVR4nO3dX2xUZ3rH8d/D2Oyy9a4KJhsRgzJZeaXVqqratbXask2FELSYoGyl3OxFY19UWikXDg1RmqzW1LY0UtRGIiHuVdRWsdM/e5OsSoiNCm1Xe1M1tbtJYW1iJnhQIXTJGrXExGb95+2Fzzl7PB47NnjmMTPfj2R5zjvnPO/7zMDPZ44PwkIIAgD42eK9AACodQQxADgjiAHAGUEMAM4IYgBwVreenXfu3Bmy2WyZlgIA1WlkZOQXIYQHVnp+XUGczWY1PDx876sCgBpiZldWe55LEwDgjCAGAGcEMQA4I4gBwBlBDADOCGIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOFvX/1m3Ufr6+pTP55eMXbt2TZLU1NS04nHNzc3q7Ows69oAoNJcgjifz+u9C2Oa/8KOZCzz6f9Jkv7nTuklZT69WZG1AUCluQSxJM1/YYemv3Y42d52cVCSloylxc8DQLXhGjEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJwRxADgjCAGAGcEMQA4I4gBwBlBDADOCGIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJwRxADgjCAGAGcEMQA4I4gBwBlBDADOCGIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADirSBD39fWpr6+vElOtyWZbD4DaVleJSfL5fCWmWbPNth4AtY1LEwDgjCAGAGcEMQA4I4gBwBlBDADOCGIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJwRxADgjCAGAGcEMQA4I4gBwBlBDADOCGIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwVue9AA+3bt3SxMSE9u3b572Ue5LJZDQ/P598l6T6+notLCwsGX/iiSf01ltvJdshBGUyGS0sLGjHjh2anJzUs88+q7179+r555/X1atXNTs7q/n5eT355JMaGRnRzMyMrl+/rqeeekqvvPKKXnrpJbW0tEiSJicn1dvbq/b2dh0/fly7du1SXV2d5ubmdP36dfX19UmSOjs7tWfPHr344ouamJjQc889p4ceekg3b95UX1+fmpubl9Tq7u7WyZMnk/Guri7duXMnqbl9+3Z1dXVpfn5eCwsL2rp1q44dO6YTJ04ohKBcLidJ6u3tVXd3txobG5P66e24RiaTUS6XU2NjY9LX8ePHk96L1/j000/r1VdfTWrF4uPiNaTrpecufu3S/W4G6fVKWrb2WlHqfdtoNXlGfOXKFe8lbIg4fOPvkpIATY+/+eabCiFobm5OIYTkuRCCJicnJUknTpxQf3+/8vm8ZmZmkmPfeOMNjY6O6vLly5qentbLL7+shYWF5C+nJPX39+v8+fPq6enR9PS0Ll++rPHx8eSYXC6nXC6n6elpjY+Pa2BgQD09PQoh6Nq1a8k+xbVu3769ZHxsbGxJzXhsfHxc+Xxeo6OjyuVyGh0d1djYmAYGBpJ6AwMDS+qnt+Ma8THpvtK9F68xl8stqVV8XKl6xfuv1O9mkF5vqbXXikr0XnNBPDw8rIWFBe9lbDohBJ06dWpN+0nS1NSURkZGNDk5qTNnziiEoKmpqZLHFAoFFQqFZPv06dPL9i0UChoZGVlW
Kx4fGhpatv8777xTcq7Y4OBgUu/MmTPK5/PLtovrDg0NaXJyMulrtTUWCoWkVvwDrfi44nrp/Uu9doVCQfl8vuTrWEnptQ0NDS1be60o9b6VQ0UuTcRnPUePHpUk5fN5bfllWFeNLTO3lM9/ktS4W+fPn7+n4/Er3d3d2r9//7p/sKXP4IvrlarV3d2tubm5NdeJzc7OysySfXO5XFI/3i6uOzs7q4GBAYUQNDs7u6Y1zs/Pa2BgQM8884z6+/uXHJeul567eCwtl8vp9ddfX7W3cuvv70/Wlu4n3WstSL8O5ez9M8+Izex7ZjZsZsMff/zxhi+g0jgb3jhTU1M6d+5cyZC823qlak1NTSVn4usVHzc3N6dCoZDUj7eL64YQdPbsWZ07d67knKXWODc3p7Nnz0rSsuPS9dJzF4+lpc/qvaTXFkJY8jrGvdaCUu9bOXzmGXEI4TVJr0lSa2vrXf1taGpqkiSdPHlSknT06FGNXP75umosfP5Lav7Kg0mNu3XkyJEVP0JjfRoaGrR//34NDg5uSBg3NDRoZmZmWa2Ghgbdvn37rsLYzBRCUF1dnXbv3q2rV69qbm4u2b5y5cqSumamgwcPKoSgt99+e9mcpdZYV1engwcPSpIOHDiw5Lh0vfh1ivdPj6Vls9l197nRDhw4kKwt/lQRv45xr7Ug/TqUs/eau0bc09PjvYSq0dvbq46ODm3Zsr4/RplMZsV6pWr19vaqrm75OcNKdWL19fWqr69P9u3q6krqx9vFdevr69Xe3q6Ojo7k2M9aYyaTUXt7uyQtOy5dLz138VhaV1fXqn1VQnptxa9j3GstKPW+lUPNBXFra+u6g6MWmJkef/zxNe0nLZ4ZtrS0qLGxUYcOHZKZqaGhoeQx2Wx2yVnekSNHlu2bzWbV0tKyrFY83tbWtmz/xx57rORcscOHDyf1Dh06pObm5mXbxXXb2trU2NiY9LXaGrPZbFIrvq2p+Ljieun9S7122Wx2U9y+ll5bW1vbsrXXilLvWznU5H3EDz/8sCYmJryXcc828j7iY8eOae/evRodHV3TfcS9vb3JOjo6OlQoFFa8jzg+w4vvI25vb9ejjz665D7ieJ90re7u7iXjly5dSu4j7urq0vbt23Xp0qUV7yOOz17iesX103Xj1yt9xtPR0ZHczpfuI64R30dcfJYUH5deQ6m5V+t3Myheb/Haa0Wp922j2Xquu7W2tobh4eF1TxLf6VB8jXj6a4eTfbZdHJSkJWNp2y4OqmUDrhGXWg8AlJOZjYQQWld6ns/oAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJwRxADgjCAGAGcEMQA4I4gBwBlBDADOCGIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOCOIAcAZQQwAzghiAHBWV4lJmpubKzHNmm229QCobRUJ4s7OzkpMs2abbT0AahuXJgDAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJwRxADgjCAGAGcEMQA4I4gBwBlBDADOCGIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJwRxADgjCAGAGcEMQA4q/OaOPPpTW27OJjanpSkJWPF+0sPVmJpAFBRLkHc3Ny8bOzatTlJUlPTSmH7YMnjAOB+5xLEnZ2dHtMCwKbENWIAcEYQA4AzghgAnBHEAOCMIAYAZwQxADgjiAHAGUEMAM4IYgBwRhADgDOCGACcEcQA4IwgBgBnBDEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJwRxADgzEIIa9/Z7GNJ
V+5yrp2SfnGXx96varFnib5rTS32vd6eHw4hPLDSk+sK4nthZsMhhNaKTLZJ1GLPEn17r6PSarHvje6ZSxMA4IwgBgBnlQzi1yo412ZRiz1L9F1rarHvDe25YteIAQClcWkCAJwRxADgrOxBbGaHzOwDM8ub2Qvlnq/czOxvzOyGmV1Ije0ws7Nmdin6vj313Pej3j8wsz9IjbeY2fnouVfNzCrdy1qZ2R4z+1czGzOzn5nZ0Wi82vv+vJm9a2bvR333RuNV3bckmVnGzH5qZqej7VrouRCt9z0zG47GKtN3CKFsX5Iykj6U9BVJWyW9L+nr5Zyz3F+Sfk/SNyRdSI39haQXoscvSPrz6PHXo54/J+mR6LXIRM+9K+l3JJmkIUlt3r2t0vMuSd+IHn9R0njUW7X3bZIaosf1kv5d0reqve9ovcck/b2k07XwZzxab0HSzqKxivRd7jPib0rKhxAuhxB+KemHkr5T5jnLKoTwE0k3i4a/I6k/etwv6Q9T4z8MIdwJIUxIykv6ppntkvSlEMK/hcV3biB1zKYTQrgeQvjP6PEnksYkNan6+w4hhKlosz76Cqryvs1st6THJP1Variqe15FRfoudxA3Sfrv1PbVaKzaPBhCuC4thpakL0fjK/XfFD0uHt/0zCwr6be1eHZY9X1HH9Hfk3RD0tkQQi30/YqkP5W0kBqr9p6lxR+y/2RmI2b2vWisIn3X3ePCP0upayO1dL/cSv3fl6+LmTVIelPSn4QQbq1y6atq+g4hzEv6LTP7dUk/MrPfWGX3+75vMzsi6UYIYcTM9q3lkBJj91XPKd8OIXxkZl+WdNbMLq6y74b2Xe4z4quS9qS2d0v6qMxzevh59JFE0fcb0fhK/V+NHhePb1pmVq/FEP67EMJb0XDV9x0LIfyvpB9LOqTq7vvbkh43s4IWLyXuN7O/VXX3LEkKIXwUfb8h6UdavLRakb7LHcT/IemrZvaImW2V9F1Jp8o8p4dTkjqixx2S/jE1/l0z+5yZPSLpq5LejT7ifGJm34p+o9qeOmbTidb415LGQggnUk9Ve98PRGfCMrNtkg5Iuqgq7juE8P0Qwu4QQlaLf1//JYTwR6riniXJzH7NzL4YP5b0+5IuqFJ9V+A3kYe1+Fv2DyX9oBK//SxzP/8g6bqkWS3+9PtjSY2S/lnSpej7jtT+P4h6/0Cp355Kao3e6A8l/aWif+W4Gb8k/a4WP179l6T3oq/DNdD3b0r6adT3BUl/Fo1Xdd+pNe/Tr+6aqOqetXhn1/vR18/irKpU3/wTZwBwxr+sAwBnBDEAOCOIAcAZQQwAzghiAHBGEAOAM4IYAJz9P7PtgOygI3TYAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.boxplot(x=s2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Why \"lognormal\"?\n",
+ "\n",
+ "sns.displot(np.log(s2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Box plots\n",
+ "\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Outliers, missing values\n",
+ "\n",
+ "An *outlier* is an observation far from the center of mass of the distribution. It might be an error or a genuine observation: this distinction requires domain knowledge. Outliers influence the outcomes of several statistics and machine learning methods: it is important to decide how to deal with them.\n",
+ "\n",
+ "A *missing value* is an observation without a value. There can be many reasons for a missing value: the value might not exist (hence its absence is informative and it should be left empty) or might not be known (hence the value exists but is missing from the dataset and it should be marked as NA).\n",
+ "\n",
+ "*One way to think about the difference is with this Zen-like koan: An explicit missing value is the presence of an absence; an implicit missing value is the absence of a presence.*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Summary statistics\n",
+ "A statistic is a function of a collection of observations, or otherwise stated a measure over a distribution. \n",
+ "\n",
+ "A statistic is said to be *robust* if not sensitive to outliers.\n",
+ "\n",
+ "* Not robust: min, max, mean, standard deviation.\n",
+ "* Robust: mode, median, other quartiles.\n",
+ "\n",
+ "A closer look at the mean:\n",
+ "\n",
+ "$\\bar{x} = \\frac{1}{n} \\sum_{i}x_i$\n",
+ "\n",
+ "And variance (the standard deviation is the square root of the variance):\n",
+ "\n",
+ "$Var(x) = \\frac{1}{n} \\sum_{i}(x_i - \\bar{x})^2$\n",
+ "\n",
+ "The mean, the median, etc. are measures of location (e.g., the typical value); the variance is a measure of dispersion."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4.993993145847454\n",
+ "240.65275836549188\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Not robust: min, max, mean, standard deviation\n",
+ "\n",
+ "print(np.mean(s1)) # should be 5\n",
+ "print(np.mean(s2))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4.993993145847454"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.mean(s1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4.990826061241218\n",
+ "147.72833254812608\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Robust: median, other quartiles\n",
+ "\n",
+ "print(np.quantile(s1, 0.5)) # should coincide with mean and mode\n",
+ "print(np.quantile(s2, 0.5))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Questions\n",
+ "\n",
+ "* Calculate the min, max, mode and sd. *hint: explore the numpy documentation!*\n",
+ "* Calculate the 90% quantile values.\n",
+ "* Consider our normally distributed data in s1. Add an outlier (e.g., value 100). What happens to the mean and mode? Write down your answer and then check."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " annual_salary \n",
+ " a_age \n",
+ " length \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " 7870.000000 \n",
+ " 9303.000000 \n",
+ " 9645.000000 \n",
+ " \n",
+ " \n",
+ " mean \n",
+ " 5.916921 \n",
+ " 14.266688 \n",
+ " 5.005694 \n",
+ " \n",
+ " \n",
+ " std \n",
+ " 6.985214 \n",
+ " 2.902770 \n",
+ " 1.462343 \n",
+ " \n",
+ " \n",
+ " min \n",
+ " 0.166667 \n",
+ " 1.000000 \n",
+ " 0.083333 \n",
+ " \n",
+ " \n",
+ " 25% \n",
+ " 3.000000 \n",
+ " 12.000000 \n",
+ " 4.000000 \n",
+ " \n",
+ " \n",
+ " 50% \n",
+ " 4.000000 \n",
+ " 14.000000 \n",
+ " 5.000000 \n",
+ " \n",
+ " \n",
+ " 75% \n",
+ " 6.000000 \n",
+ " 16.000000 \n",
+ " 6.000000 \n",
+ " \n",
+ " \n",
+ " max \n",
+ " 180.000000 \n",
+ " 50.000000 \n",
+ " 15.000000 \n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ " annual_salary a_age length\n",
+ "count 7870.000000 9303.000000 9645.000000\n",
+ "mean 5.916921 14.266688 5.005694\n",
+ "std 6.985214 2.902770 1.462343\n",
+ "min 0.166667 1.000000 0.083333\n",
+ "25% 3.000000 12.000000 4.000000\n",
+ "50% 4.000000 14.000000 5.000000\n",
+ "75% 6.000000 16.000000 6.000000\n",
+ "max 180.000000 50.000000 15.000000"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Let's explore our dataset\n",
+ "df_contracts[[\"annual_salary\",\"a_age\",\"length\"]].describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Relating two variables\n",
+ "\n",
+ "### Covariance\n",
+ "\n",
+ "Measure of association, specifically of the joint linear variability of two variables:\n",
+ "\n",
+ " \n",
+ "\n",
+ "Its normalized version is called the (Pearson's) correlation coefficient:\n",
+ "\n",
+ " \n",
+ "\n",
+ "Correlation is helpful to spot possible relations, but is of tricky interpretation and is not exhaustive:\n",
+ "\n",
+ " \n",
+ "\n",
+ "See: https://en.wikipedia.org/wiki/Covariance and https://en.wikipedia.org/wiki/Pearson_correlation_coefficient.\n",
+ "\n",
+ "*Note: correlation is not causation!*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " annual_salary \n",
+ " a_age \n",
+ " length \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " annual_salary \n",
+ " 1.000000 \n",
+ " 0.205404 \n",
+ " -0.361611 \n",
+ " \n",
+ " \n",
+ " a_age \n",
+ " 0.205404 \n",
+ " 1.000000 \n",
+ " -0.430062 \n",
+ " \n",
+ " \n",
+ " length \n",
+ " -0.361611 \n",
+ " -0.430062 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ " annual_salary a_age length\n",
+ "annual_salary 1.000000 0.205404 -0.361611\n",
+ "a_age 0.205404 1.000000 -0.430062\n",
+ "length -0.361611 -0.430062 1.000000"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_contracts[[\"annual_salary\",\"a_age\",\"length\"]].corr()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.scatterplot(x=df_contracts.length,y=df_contracts.annual_salary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Other ways to measure correlation exist. For example, if you are interested in how one variable will increase (or decrease) as another variable increases (or decreases), the *Spearman's or Kendall's rank correlation coefficients* might work well."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Questions\n",
+ "\n",
+ "* Try to explore the correlation of other variables in the dataset.\n",
+ "* Can you think of a possible motivation for the trend we see: older apprentices with a shorter contract getting on average a higher annual salary?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Sampling and uncertainty\n",
+ "\n",
+ "Often, we work with samples and we want the sample to be representative of the population it is taken from, in order to draw conclusions that generalise from the sample to the full population.\n",
+ "\n",
+ "Sampling is *tricky*. Samples have *variance* (variation between samples from the same population) and *bias* (systematic variation from the population)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Further reading\n",
+ "\n",
+ "* For a complementary introduction to statistics and data analysis, see https://www.humanitiesdataanalysis.org/statistics-essentials/notebook.html.\n",
+ "* Related to statistics and data analysis is the realm of probability theory, which allows us to formally model and calculate the likelihood of events. For an introduction, see https://www.humanitiesdataanalysis.org/intro-probability/notebook.html."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part 2: working with texts\n",
+ "\n",
+ "Let's get some basics (or a refresher) of working with texts in Python. Texts are sequences of discrete symbols (words or, more generically, tokens).\n",
+ "\n",
+ "Key challenge: representing text for further processing. Two mainstream approaches:\n",
+ "* *Bag of words*: a text is a collection of tokens occurring with a certain frequency and assumed independent of each other within the text. The mapping from texts to features is deterministic and straightforward: each text is represented as a vector of the size of the vocabulary.\n",
+ "* *Embeddings*: a method is used (typically, neural networks), to learn a mapping from each token to a (usually small) vector representing it. A text can be represented in turn as an aggregation of these embeddings."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import the dataset\n",
+ "Let us load the Elon Musk tweets dataset into memory.\n",
+ "\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "root_folder = \"../data/musk_tweets\"\n",
+ "df_elon = pd.read_csv(codecs.open(os.path.join(root_folder,\"elonmusk_tweets.csv\"), encoding=\"utf8\"), sep=\",\")\n",
+ "df_elon['text'] = df_elon['text'].str[1:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " created_at \n",
+ " text \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 849636868052275200 \n",
+ " 2017-04-05 14:56:29 \n",
+ " 'And so the robots spared humanity ... https:/... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 848988730585096192 \n",
+ " 2017-04-03 20:01:01 \n",
+ " \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 848943072423497728 \n",
+ " 2017-04-03 16:59:35 \n",
+ " '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 848935705057280001 \n",
+ " 2017-04-03 16:30:19 \n",
+ " 'Stormy weather in Shortville ...' \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 848416049573658624 \n",
+ " 2017-04-02 06:05:23 \n",
+ " \"@DaveLeeBBC @verge Coal is dying due to nat g... \n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ " id created_at \\\n",
+ "0 849636868052275200 2017-04-05 14:56:29 \n",
+ "1 848988730585096192 2017-04-03 20:01:01 \n",
+ "2 848943072423497728 2017-04-03 16:59:35 \n",
+ "3 848935705057280001 2017-04-03 16:30:19 \n",
+ "4 848416049573658624 2017-04-02 06:05:23 \n",
+ "\n",
+ " text \n",
+ "0 'And so the robots spared humanity ... https:/... \n",
+ "1 \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
+ "2 '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
+ "3 'Stormy weather in Shortville ...' \n",
+ "4 \"@DaveLeeBBC @verge Coal is dying due to nat g... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_elon.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2819, 3)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_elon.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Natural Language Processing in Python"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import some of the most popular libraries for NLP in Python\n",
+ "import spacy\n",
+ "import nltk\n",
+ "import string\n",
+ "import sklearn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data] /Users/giovannicolavizza/nltk_data...\n",
+ "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nltk.download('punkt')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A typical NLP pipeline might look like the following:\n",
+ " \n",
+ " \n",
+ "\n",
+ "### Tokenization: splitting a text into constituent tokens"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nltk.tokenize import TweetTokenizer, word_tokenize\n",
+ "tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\"@ForIn2020 @waltmossberg @mims @defcon_5 Exactly. Tesla is absurdly overvalued if based on the past, but that's irr\\xe2\\x80\\xa6 https://t.co/qQcTqkzgMl\"\n"
+ ]
+ }
+ ],
+ "source": [
+ "example_tweet = df_elon.text[1]\n",
+ "print(example_tweet)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['\"', '@ForIn2020', '@waltmossberg', '@mims', '@defcon_5', 'Exactly', '.', 'Tesla', 'is', 'absurdly', 'overvalued', 'if', 'based', 'on', 'the', 'past', ',', 'but', \"that's\", 'irr', '\\\\', 'xe2', '\\\\', 'x80', '\\\\', 'xa6', 'https://t.co/qQcTqkzgMl', '\"']\n",
+ "['``', '@', 'ForIn2020', '@', 'waltmossberg', '@', 'mims', '@', 'defcon_5', 'Exactly', '.', 'Tesla', 'is', 'absurdly', 'overvalued', 'if', 'based', 'on', 'the', 'past', ',', 'but', 'that', \"'s\", 'irr\\\\xe2\\\\x80\\\\xa6', 'https', ':', '//t.co/qQcTqkzgMl', \"''\"]\n"
+ ]
+ }
+ ],
+ "source": [
+ "tkz1 = tknzr.tokenize(example_tweet)\n",
+ "print(tkz1)\n",
+ "tkz2 = word_tokenize(example_tweet)\n",
+ "print(tkz2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Question: can you spot what the Twitter tokenizer does differently from a standard one?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "string.punctuation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# some more pre-processing\n",
+ "\n",
+ "def filter(tweet):\n",
+ " \n",
+ " # remove punctuation and short words and urls\n",
+ " tweet = [t for t in tweet if t not in string.punctuation and len(t) > 3 and not t.startswith(\"http\")]\n",
+ " return tweet\n",
+ "\n",
+ "def tokenize_and_string(tweet):\n",
+ " \n",
+ " tkz = tknzr.tokenize(tweet)\n",
+ " \n",
+ " tkz = filter(tkz)\n",
+ " \n",
+ " return \" \".join(tkz)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['\"', '@ForIn2020', '@waltmossberg', '@mims', '@defcon_5', 'Exactly', '.', 'Tesla', 'is', 'absurdly', 'overvalued', 'if', 'based', 'on', 'the', 'past', ',', 'but', \"that's\", 'irr', '\\\\', 'xe2', '\\\\', 'x80', '\\\\', 'xa6', 'https://t.co/qQcTqkzgMl', '\"']\n",
+ "['@ForIn2020', '@waltmossberg', '@mims', '@defcon_5', 'Exactly', 'Tesla', 'absurdly', 'overvalued', 'based', 'past', \"that's\"]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(tkz1)\n",
+ "print(filter(tkz1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_elon[\"clean_text\"] = df_elon[\"text\"].apply(tokenize_and_string)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " created_at \n",
+ " text \n",
+ " clean_text \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 849636868052275200 \n",
+ " 2017-04-05 14:56:29 \n",
+ " 'And so the robots spared humanity ... https:/... \n",
+ " robots spared humanity \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 848988730585096192 \n",
+ " 2017-04-03 20:01:01 \n",
+ " \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
+ " @ForIn2020 @waltmossberg @mims @defcon_5 Exact... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 848943072423497728 \n",
+ " 2017-04-03 16:59:35 \n",
+ " '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
+ " @waltmossberg @mims @defcon_5 Walt \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 848935705057280001 \n",
+ " 2017-04-03 16:30:19 \n",
+ " 'Stormy weather in Shortville ...' \n",
+ " Stormy weather Shortville \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 848416049573658624 \n",
+ " 2017-04-02 06:05:23 \n",
+ " \"@DaveLeeBBC @verge Coal is dying due to nat g... \n",
+ " @DaveLeeBBC @verge Coal dying fracking It's ba... \n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ " id created_at \\\n",
+ "0 849636868052275200 2017-04-05 14:56:29 \n",
+ "1 848988730585096192 2017-04-03 20:01:01 \n",
+ "2 848943072423497728 2017-04-03 16:59:35 \n",
+ "3 848935705057280001 2017-04-03 16:30:19 \n",
+ "4 848416049573658624 2017-04-02 06:05:23 \n",
+ "\n",
+ " text \\\n",
+ "0 'And so the robots spared humanity ... https:/... \n",
+ "1 \"@ForIn2020 @waltmossberg @mims @defcon_5 Exac... \n",
+ "2 '@waltmossberg @mims @defcon_5 Et tu, Walt?' \n",
+ "3 'Stormy weather in Shortville ...' \n",
+ "4 \"@DaveLeeBBC @verge Coal is dying due to nat g... \n",
+ "\n",
+ " clean_text \n",
+ "0 robots spared humanity \n",
+ "1 @ForIn2020 @waltmossberg @mims @defcon_5 Exact... \n",
+ "2 @waltmossberg @mims @defcon_5 Walt \n",
+ "3 Stormy weather Shortville \n",
+ "4 @DaveLeeBBC @verge Coal dying fracking It's ba... "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_elon.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save cleaned up version\n",
+ "\n",
+ "df_elon.to_csv(os.path.join(root_folder,\"df_elon.csv\"), index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Building a dictionary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2819, 7864)"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "count_vect = CountVectorizer(lowercase=False, tokenizer=tknzr.tokenize)\n",
+ "X_count = count_vect.fit_transform(df_elon.clean_text)\n",
+ "X_count.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "6617"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "word_list = count_vect.get_feature_names_out() \n",
+ "count_list = X_count.toarray().sum(axis=0)\n",
+ "dictionary = dict(zip(word_list,count_list))\n",
+ "count_vect.vocabulary_.get(\"robots\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_count[:,count_vect.vocabulary_.get(\"robots\")].toarray().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dictionary[\"robots\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Questions\n",
+ "\n",
+ "* Find the tokens most used by Elon.\n",
+ "* Find the Twitter users most referred to by Elon (hint: use the @ prefix of Twitter handles to spot them)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('Tesla', 322),\n",
+ " ('Model', 236),\n",
+ " ('that', 223),\n",
+ " ('will', 218),\n",
+ " ('with', 177),\n",
+ " ('@SpaceX', 169),\n",
+ " ('from', 163),\n",
+ " ('this', 159),\n",
+ " ('@TeslaMotors', 149),\n",
+ " ('launch', 124)]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dictionary_list = sorted(dictionary.items(), key=lambda x:x[1], reverse=True)\n",
+ "[d for d in dictionary_list][:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('@SpaceX', 169),\n",
+ " ('@TeslaMotors', 149),\n",
+ " ('@elonmusk', 85),\n",
+ " ('@NASA', 48),\n",
+ " ('@Space_Station', 19),\n",
+ " ('@FredericLambert', 17),\n",
+ " ('@ID_AA_Carmack', 15),\n",
+ " ('@WIRED', 14),\n",
+ " ('@vicentes', 14),\n",
+ " ('@BadAstronomer', 11)]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[d for d in dictionary_list if d[0].startswith('@')][:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Representing tweets as vectors\n",
+ "\n",
+ "Texts are of variable length and need to be represented numerically in some way. Most typically, we represent them as *equally-sized vectors*.\n",
+ "\n",
+ "Actually, this is what we have already done! Let's take a closer look at `X_count` above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 849636868052275200\n",
+ "created_at 2017-04-05 14:56:29\n",
+ "text 'And so the robots spared humanity ... https:/...\n",
+ "clean_text robots spared humanity\n",
+ "Name: 0, dtype: object"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# This is the first Tweet of the data frame\n",
+ "\n",
+ "df_elon.loc[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# let's get the vector representation for this Tweet\n",
+ "\n",
+ "vector_representation = X_count[0,:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# there are 3 non-zero positions, as we would expect (so the counts sum to 3): the vector contains 1 in the columns related to the 3 words that make up the Tweet.\n",
+ "# It would contain a number higher than 1 if a given word were occurring multiple times.\n",
+ "\n",
+ "np.sum(vector_representation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1\n",
+ "1\n",
+ "1\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Let's check that indeed the vector contains 1s for the right words\n",
+ "# Remember, the vector has shape (1 x size of the vocabulary)\n",
+ "\n",
+ "print(vector_representation[0,count_vect.vocabulary_.get(\"robots\")])\n",
+ "print(vector_representation[0,count_vect.vocabulary_.get(\"spared\")])\n",
+ "print(vector_representation[0,count_vect.vocabulary_.get(\"humanity\")])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Term Frequency - Inverse Document Frequency\n",
+ "We can use boolean counts (1/0) and raw counts (as we did before) to represent a Tweet over the space of the vocabulary, but there exist improvements on this basic idea. For example, the TF-IDF weighting scheme:\n",
+ "\n",
+ "$tfidf(t, d, D) = tf(t, d) \\cdot idf(t, D)$\n",
+ "\n",
+ "$tf(t, d) = f_{t,d}$\n",
+ "\n",
+ "$idf(t, D) = \\log \\Big( \\frac{|D|}{|\\{d \\in D: t \\in d\\}|} \\Big)$"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2819, 7864)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "count_vect = TfidfVectorizer(lowercase=False, tokenizer=tknzr.tokenize)\n",
+ "X_count_tfidf = count_vect.fit_transform(df_elon.clean_text)\n",
+ "X_count_tfidf.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.7226760995112569"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_count_tfidf[0,:].sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_count[0,:].sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Sparse vectors (mention)\n",
+ "How is Python representing these vectors in memory? Most of their cells are set to zero. \n",
+ "\n",
+ "We call any vector or matrix whose cells are mostly zero *sparse*.\n",
+ "There are efficient ways to store them in memory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<1x7864 sparse matrix of type ''\n",
+ "\twith 3 stored elements in Compressed Sparse Row format>"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_count_tfidf[0,:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Spacy pipelines\n",
+ "\n",
+ "Useful to construct sequences of pre-processing steps: https://spacy.io/usage/processing-pipelines."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load a pre-trained pipeline (Web Small): https://spacy.io/usage/models\n",
+ "\n",
+ "#!python -m spacy download en_core_web_sm\n",
+ "nlp = spacy.load('en_core_web_sm')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*.. the model's meta.json tells spaCy to use the language \"en\" and the pipeline [\"tagger\", \"parser\", \"ner\"]. spaCy will then initialize spacy.lang.en.English, and create each pipeline component and add it to the processing pipeline. It'll then load in the model's data from its data directory and return the modified Language class for you to use as the nlp object.*\n",
+ "\n",
+ "Let's create a simple pipeline that does **lemmatization**, **part of speech tagging** and **named entity recognition** using spaCy models.\n",
+ "\n",
+ "*If you don't know what these NLP tasks are, please ask!*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tweet_pos = list()\n",
+ "tweet_ner = list()\n",
+ "tweet_lemmas = list()\n",
+ "\n",
+ "for tweet in df_elon.text.values:\n",
+ " spacy_tweet = nlp(tweet)\n",
+ " \n",
+ " local_tweet_pos = list()\n",
+ " local_tweet_ner = list()\n",
+ " local_tweet_lemmas = list()\n",
+ " \n",
+ " for sentence in list(spacy_tweet.sents):\n",
+ " # --- lemmatization, remove punctuation and stop words\n",
+ " local_tweet_lemmas.extend([token.lemma_ for token in sentence if not token.is_punct | token.is_stop])\n",
+ " local_tweet_pos.extend([token.pos_ for token in sentence if not token.is_punct | token.is_stop])\n",
+ " for ent in spacy_tweet.ents:\n",
+ " local_tweet_ner.append(ent)\n",
+ "\n",
+ " tweet_pos.append(local_tweet_pos)\n",
+ " tweet_ner.append(local_tweet_ner)\n",
+ " tweet_lemmas.append(local_tweet_lemmas)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['robot', 'spare', 'humanity', 'https://t.co/v7JUJQWfCv']"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tweet_lemmas[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['NOUN', 'VERB', 'NOUN', 'NOUN']"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tweet_pos[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[https://t.co/v7JUJQWfCv]"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tweet_ner[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Shortville]"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# but it actually works!\n",
+ "\n",
+ "tweet_ner[3]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Note: we are really just scratching the surface of spaCy, but it is worth knowing it's there.*"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Searching tweets\n",
+ "\n",
+ "Once we have represented tweets as vectors, we can easily find similar ones using basic operations such as filtering."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "robots spared humanity\n"
+ ]
+ }
+ ],
+ "source": [
+ "target = 0\n",
+ "print(df_elon.clean_text[target])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "condition = X_count_tfidf[target,:] > 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " (0, 5198)\tTrue\n",
+ " (0, 6617)\tTrue\n",
+ " (0, 6949)\tTrue\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(condition)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_filtered = X_count_tfidf[:,np.ravel(condition.toarray())]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<2819x3 sparse matrix of type ''\n",
+ "\twith 16 stored elements in Compressed Sparse Row format>"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_filtered"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " (0, 0)\t0.495283407359234\n",
+ " (0, 2)\t0.6406029997190412\n",
+ " (0, 1)\t0.5867896924329815\n",
+ " (217, 0)\t0.2972381925908634\n",
+ " (271, 0)\t0.3284547085372313\n",
+ " (464, 0)\t0.2273880239746895\n",
+ " (473, 0)\t0.5667220639589731\n",
+ " (734, 1)\t0.3846355279044392\n",
+ " (940, 0)\t0.27312597149485407\n",
+ " (1004, 0)\t0.28161575586607157\n",
+ " (1550, 1)\t0.33303254164524276\n",
+ " (1862, 0)\t0.3196675199194523\n",
+ " (2493, 0)\t0.2685018991334563\n",
+ " (2559, 0)\t0.31145247014227906\n",
+ " (2565, 0)\t0.2645117238497897\n",
+ " (2661, 0)\t0.2729016388865858\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(X_filtered)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([ 0, 217, 271, 464, 473, 940, 1004, 1862, 2493, 2559, 2565,\n",
+ " 2661, 0, 734, 1550, 0], dtype=int32),\n",
+ " array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2], dtype=int32),\n",
+ " array([0.49528341, 0.29723819, 0.32845471, 0.22738802, 0.56672206,\n",
+ " 0.27312597, 0.28161576, 0.31966752, 0.2685019 , 0.31145247,\n",
+ " 0.26451172, 0.27290164, 0.58678969, 0.38463553, 0.33303254,\n",
+ " 0.640603 ]))"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from scipy import sparse\n",
+ "\n",
+ "sparse.find(X_filtered)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tweet_indices = list(sparse.find(X_filtered)[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TARGET: robots spared humanity\n",
+ "1)@JustBe74 important make humanity proud this case particular duty owed American taxpayer\n",
+ "2)@pud Faith restored humanity French toast money\n",
+ "3)humanity have exciting inspiring future cannot confined Earth forever @love_to_dream #APSpaceChat\n",
+ "4)@ShireeshAgrawal like humanity\n",
+ "5)Creating neural lace thing that really matters humanity achieve symbiosis with machines\n",
+ "6)@tzepr Certainly agree that first foremost triumph humanity cheering good spirit\n",
+ "7)@ReesAndersen @FLIxrisk believe that critical ensure good future humanity\n",
+ "8)@NASA #Mars hard x99s worth risks extend humanity x99s frontier beyond Earth Learn about neighbor planet\n",
+ "9)Astronomer Royal Martin Rees soon will robots take over world @Telegraph\n",
+ "10)@thelogicbox @IanrossWins Mars critical long-term survival humanity life Earth know\n",
+ "11)humanity wishes become multi-planet species then must figure move millions people Mars\n",
+ "12)Sure feels weird find myself defending robots\n",
+ "13)Neil Armstrong hero humanity spirit will carry stars\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"TARGET: \" + df_elon.clean_text[target])\n",
+ "\n",
+ "for n, tweet_index in enumerate(list(set(tweet_indices))):\n",
+ " if tweet_index != target:\n",
+ " print(str(n) +\")\"+ df_elon.clean_text[tweet_index])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Questions\n",
+ "\n",
+ "* Can you rank the matched tweets by their tf-idf weights, so that higher-weighted tweets come first?\n",
+ "* Which limitations do you think a bag of words representation has?\n",
+ "* Can you spot any limitations of this approach based on similarity measures over bag of words representations?\n",
+ "\n",
+ "#### Exercises\n",
+ "\n",
+ "* Find the highest IDF tokens in the corpus and discuss your results.\n",
+ "* Find the most frequent named entities and discuss your results.\n",
+ "* Are there trends in the topics of tweets during the day? And over time?\n",
+ "* Perform a **sentiment analysis** of the corpus, see here https://spacy.io/universe/project/spacy-textblob"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install spacytextblob\n",
+ "#!python -m spacy download en_core_web_sm\n",
+ "#!python -m textblob.download_corpora"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "-0.125"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import spacy\n",
+ "from spacytextblob.spacytextblob import SpacyTextBlob\n",
+ "\n",
+ "nlp = spacy.load('en_core_web_sm')\n",
+ "nlp.add_pipe('spacytextblob')\n",
+ "text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'\n",
+ "doc = nlp(text)\n",
+ "doc._.blob.polarity "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
|