Merge pull request #14 from cortze/dev

Merge CLI-refactoring
cortze · Jan 12, 2023 · 50abb81 · 50abb81
2 parents 527d4aa + 977c3d1
commit 50abb81
Show file tree

Hide file tree

Showing 64 changed files with 11,691 additions and 5,854 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,13 +5,16 @@ build/
 data/*
 
 # Dismiss python cached stuff and venv
+__pycache__/
 .ipynb_checkpoints/
 analyzer/.ipynb_checkpoints/
 analyzer/venv/
+logs/
 
-# Igonore the .vscode configuration
+# Ignore the .vscode configuration
 .vscode
-
+# Ignore the .idea configuration
+.idea
 
 # TODO: update to make file and build folder
-dbs
+dbs
diff --git a/Makefile b/Makefile
@@ -17,7 +17,7 @@ install:
 
 dependencies:
 	$(GIT_SUBM) update --init
-	cd go-libp2p-kad-dht && git checkout cid-hoarder
+	cd go-libp2p-kad-dht && git checkout origin/cid-hoarder
 
 
 clean:

diff --git a/README.md b/README.md
diff --git a/analyzer/CID_distribution.ipynb b/analyzer/CID_distribution.ipynb
diff --git a/analyzer/cid_distribution_in_hash_space.ipynb b/analyzer/cid_distribution_in_hash_space.ipynb
diff --git a/analyzer/cid_hoarder_analyzer.ipynb b/analyzer/cid_hoarder_analyzer.ipynb
diff --git a/analyzer/cid_pinging_phase.ipynb b/analyzer/cid_pinging_phase.ipynb
@@ -0,0 +1,349 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of the CID pinging phase\n",
+    "\n",
+    "  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Import dependencies\n",
+    "import os\n",
+    "\n",
+    "import sqlalchemy as sa\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "## DB Credentials\n",
+    "# Every value can be overridden with an environment variable, so real\n",
+    "# credentials do not need to be written into the notebook; the fallbacks\n",
+    "# are the settings of the local test database.\n",
+    "HOST = os.environ.get(\"HOARDER_DB_HOST\", \"localhost\")\n",
+    "PORT = os.environ.get(\"HOARDER_DB_PORT\", \"5432\")\n",
+    "DB = os.environ.get(\"HOARDER_DB_NAME\", \"hoarder_test\")\n",
+    "USER = os.environ.get(\"HOARDER_DB_USER\", \"hoarder\")\n",
+    "PASSWD = os.environ.get(\"HOARDER_DB_PASSWD\", \"password\")\n",
+    "\n",
+    "# Connect to the DB\n",
+    "engine = sa.create_engine(f'postgresql://{USER}:{PASSWD}@{HOST}:{PORT}/{DB}')\n",
+    "\n",
+    "## plotting style\n",
+    "fig_size= (7,4)\n",
+    "sns.set_context(\"talk\", font_scale=1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## get the mean fetch time of each ping round\n",
+    "\n",
+    "sql_query=\"\"\"\n",
+    "    SELECT \n",
+    "        cid_hash,\n",
+    "        ping_round, \n",
+    "        fetch_time\n",
+    "    FROM fetch_results\n",
+    "    ORDER BY ping_round;\n",
+    "\"\"\"\n",
+    "ping_rounds = pd.read_sql_query(sql_query, engine)\n",
+    "\n",
+    "# average only `fetch_time`: `cid_hash` (presumably a string) is also in the frame,\n",
+    "# and a mean over every column fails on non-numeric data in recent pandas versions\n",
+    "avg_fetcht = ping_rounds.groupby(\"ping_round\")[\"fetch_time\"].mean()\n",
+    "hours_dist = avg_fetcht.to_numpy()\n",
+    "\n",
+    "# offset from the first ping round; the division by 3600 assumes fetch_time is in seconds\n",
+    "hours_dist = (hours_dist - hours_dist[0]) / 3600\n",
+    "print(hours_dist)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Track the Activity (Online Status) of the PR Holders\n",
+    "We divide them into:\n",
+    "1. Total PR Holders\n",
+    "2. Only non-hydra PR Holders\n",
+    "3. Only hydra PR Holders"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_ping_dist(pd_obj, column_name, opts):\n",
+    "    \"\"\"Box-plot, for every ping round, the per-CID count of peers whose `column_name` is True.\n",
+    "\n",
+    "    Args:\n",
+    "        pd_obj: DataFrame with the columns `ping_round`, `cid_hash`, `count` and `column_name`,\n",
+    "            holding one row per (ping_round, cid_hash, column_name value).\n",
+    "        column_name: name of the column to pivot on (e.g. \"is_active\").\n",
+    "        opts: plot options. \"ylabel\" is required; the optional \"max_hours\" (default 36)\n",
+    "            is the last tick of the x axis and should cover the study duration.\n",
+    "\n",
+    "    Reads the global `hours_dist`: one x position (hours) per ping round, where round `i`\n",
+    "    is expected at index `i`.\n",
+    "    \"\"\"\n",
+    "    ## Get the per-CID count of each `column_name` value in every ping_round\n",
+    "    pv_table = pd_obj.pivot(index=[\"ping_round\", \"cid_hash\"], columns=column_name, values=\"count\")\n",
+    "    pv_table = pv_table.fillna(0)\n",
+    "    # after the to_records() round-trip the pivoted labels are addressed by their string form\n",
+    "    aux = pd.DataFrame(pv_table.to_records())\n",
+    "    # without any True row the pivot has no \"True\" column: count zero instead of raising KeyError\n",
+    "    if \"True\" not in aux.columns:\n",
+    "        aux[\"True\"] = 0\n",
+    "\n",
+    "    # make dist: one set of samples (a value per CID) for each ping round\n",
+    "    dist = [aux.loc[aux[\"ping_round\"] == i, \"True\"] for i in range(len(hours_dist))]\n",
+    "\n",
+    "    ## Make a boxplot with the distribution\n",
+    "    fig, ax = plt.subplots(figsize=(12,6))\n",
+    "    ax.boxplot(dist, positions=hours_dist, showfliers=True)\n",
+    "    # 10 evenly spaced ticks from 0 to the study duration (pass opts[\"max_hours\"] to change it)\n",
+    "    ticks = np.linspace(0, opts.get(\"max_hours\", 36), 10)\n",
+    "    ax.set_xticks(ticks)\n",
+    "    ax.set_xticklabels(ticks.astype(int))\n",
+    "    ax.set_xlabel(\"Time Since Publication (Hours)\")\n",
+    "    ax.set_ylabel(opts[\"ylabel\"])\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Get the active peers distribution per ping_round\n",
+    "\n",
+    "sql_query = \"\"\"\n",
+    "SELECT \n",
+    "\tping.cid_hash,\n",
+    "\tping.ping_round,\n",
+    "\tping.is_active,\n",
+    "\tcount(ping.is_active)\n",
+    "FROM (\n",
+    "\tSELECT \n",
+    "\t\tpr.cid_hash,\n",
+    "\t\tpr.ping_round,\n",
+    "\t\tpr.is_active, \n",
+    "\t\tpr.has_records,\n",
+    "\t\tpeer_info.client\n",
+    "\tFROM ping_results as pr\n",
+    "\tLEFT JOIN peer_info ON pr.peer_id = peer_info.peer_id\n",
+    "\tORDER BY ping_round ASC\n",
+    ") as ping\n",
+    "GROUP BY cid_hash, ping_round, is_active;\n",
+    "\"\"\"\n",
+    "\n",
+    "# stored as `pings` so the fetch times kept in `ping_rounds` are not overwritten\n",
+    "pings = pd.read_sql_query(sql_query, engine)\n",
+    "plot_ping_dist(pings, \"is_active\", {\"ylabel\":\"Peers Online\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Online non-hydra PR Holders: per-CID count for each ping_round\n",
+    "# NOTE(review): a NULL `client` (no match in the LEFT JOIN) never satisfies `!=`,\n",
+    "# so such peers are not counted here - confirm this is intended.\n",
+    "\n",
+    "sql_query = \"\"\"\n",
+    "\tSELECT \n",
+    "\t\tping.cid_hash,\n",
+    "\t\tping.ping_round,\n",
+    "\t\tping.is_active,\n",
+    "\t\tcount(ping.is_active)\n",
+    "\tFROM (\n",
+    "\t\tSELECT \n",
+    "\t\t\tpr.cid_hash,\n",
+    "\t\t\tpr.ping_round,\n",
+    "\t\t\tpr.is_active, \n",
+    "\t\t\tpr.has_records,\n",
+    "\t\t\tpeer_info.client\n",
+    "\t\tFROM ping_results as pr\n",
+    "\t\tLEFT JOIN peer_info ON pr.peer_id = peer_info.peer_id\n",
+    "\t\tORDER BY ping_round ASC\n",
+    "\t) as ping\n",
+    "\tWHERE ping.client!='hydra-booster'\n",
+    "\tGROUP BY cid_hash, ping_round, is_active;\n",
+    "\"\"\"\n",
+    "\n",
+    "# run the query, then box-plot the `is_active == True` counts\n",
+    "pings = pd.read_sql_query(sql=sql_query, con=engine)\n",
+    "plot_ping_dist(pd_obj=pings, column_name=\"is_active\", opts={\"ylabel\": \"Peers Online\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Online hydra PR Holders: per-CID count for each ping_round\n",
+    "\n",
+    "sql_query = \"\"\"\n",
+    "\tSELECT \n",
+    "\t\tping.cid_hash,\n",
+    "\t\tping.ping_round,\n",
+    "\t\tping.is_active,\n",
+    "\t\tcount(ping.is_active)\n",
+    "\tFROM (\n",
+    "\t\tSELECT \n",
+    "\t\t\tpr.cid_hash,\n",
+    "\t\t\tpr.ping_round,\n",
+    "\t\t\tpr.is_active, \n",
+    "\t\t\tpr.has_records,\n",
+    "\t\t\tpeer_info.client\n",
+    "\t\tFROM ping_results as pr\n",
+    "\t\tLEFT JOIN peer_info ON pr.peer_id = peer_info.peer_id\n",
+    "\t\tORDER BY ping_round ASC\n",
+    "\t) as ping\n",
+    "\tWHERE ping.client='hydra-booster'\n",
+    "\tGROUP BY cid_hash, ping_round, is_active;\n",
+    "\"\"\"\n",
+    "\n",
+    "# run the query, then box-plot the `is_active == True` counts\n",
+    "pings = pd.read_sql_query(sql=sql_query, con=engine)\n",
+    "plot_ping_dist(pd_obj=pings, column_name=\"is_active\", opts={\"ylabel\": \"Peers Online\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Track whether the PR Holders share the PRs\n",
+    "We divide them into:\n",
+    "1. Total PR Holders sharing the PRs\n",
+    "2. Only non-hydra PR Holders sharing the PRs\n",
+    "3. Only hydra PR Holders sharing the PRs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## PR Holders sharing the PRs: per-CID count for each ping_round\n",
+    "\n",
+    "sql_query = \"\"\"\n",
+    "SELECT \n",
+    "\tping.cid_hash,\n",
+    "\tping.ping_round,\n",
+    "\tping.has_records,\n",
+    "\tcount(ping.has_records)\n",
+    "FROM (\n",
+    "\tSELECT \n",
+    "\t\tpr.cid_hash,\n",
+    "\t\tpr.ping_round,\n",
+    "\t\tpr.is_active, \n",
+    "\t\tpr.has_records,\n",
+    "\t\tpeer_info.client\n",
+    "\tFROM ping_results as pr\n",
+    "\tLEFT JOIN peer_info ON pr.peer_id = peer_info.peer_id\n",
+    "\tORDER BY ping_round ASC\n",
+    ") as ping\n",
+    "GROUP BY cid_hash, ping_round, has_records;\n",
+    "\"\"\"\n",
+    "\n",
+    "# run the query, then box-plot the `has_records == True` counts\n",
+    "pings = pd.read_sql_query(sql=sql_query, con=engine)\n",
+    "plot_ping_dist(pd_obj=pings, column_name=\"has_records\", opts={\"ylabel\": \"Peers Sharing PRs\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Non-hydra PR Holders sharing the PRs: per-CID count for each ping_round\n",
+    "# NOTE(review): a NULL `client` (no match in the LEFT JOIN) never satisfies `!=`,\n",
+    "# so such peers are not counted here - confirm this is intended.\n",
+    "\n",
+    "sql_query = \"\"\"\n",
+    "SELECT \n",
+    "\tping.cid_hash,\n",
+    "\tping.ping_round,\n",
+    "\tping.has_records,\n",
+    "\tcount(ping.has_records)\n",
+    "FROM (\n",
+    "\tSELECT \n",
+    "\t\tpr.cid_hash,\n",
+    "\t\tpr.ping_round,\n",
+    "\t\tpr.is_active, \n",
+    "\t\tpr.has_records,\n",
+    "\t\tpeer_info.client\n",
+    "\tFROM ping_results as pr\n",
+    "\tLEFT JOIN peer_info ON pr.peer_id = peer_info.peer_id\n",
+    "\tORDER BY ping_round ASC\n",
+    ") as ping\n",
+    "WHERE ping.client!='hydra-booster'\n",
+    "GROUP BY cid_hash, ping_round, has_records;\n",
+    "\"\"\"\n",
+    "\n",
+    "# run the query, then box-plot the `has_records == True` counts\n",
+    "pings = pd.read_sql_query(sql=sql_query, con=engine)\n",
+    "plot_ping_dist(pd_obj=pings, column_name=\"has_records\", opts={\"ylabel\": \"Peers Sharing PRs\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Hydra peers sharing the PRs: per-CID count for each ping_round\n",
+    "\n",
+    "sql_query = \"\"\"\n",
+    "SELECT \n",
+    "\tping.cid_hash,\n",
+    "\tping.ping_round,\n",
+    "\tping.has_records,\n",
+    "\tcount(ping.has_records)\n",
+    "FROM (\n",
+    "\tSELECT \n",
+    "\t\tpr.cid_hash,\n",
+    "\t\tpr.ping_round,\n",
+    "\t\tpr.is_active, \n",
+    "\t\tpr.has_records,\n",
+    "\t\tpeer_info.client\n",
+    "\tFROM ping_results as pr\n",
+    "\tLEFT JOIN peer_info ON pr.peer_id = peer_info.peer_id\n",
+    "\tORDER BY ping_round ASC\n",
+    ") as ping\n",
+    "WHERE ping.client='hydra-booster'\n",
+    "GROUP BY cid_hash, ping_round, has_records;\n",
+    "\"\"\"\n",
+    "\n",
+    "# run the query, then box-plot the `has_records == True` counts\n",
+    "pings = pd.read_sql_query(sql=sql_query, con=engine)\n",
+    "plot_ping_dist(pd_obj=pings, column_name=\"has_records\", opts={\"ylabel\": \"Peers Sharing PRs\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Last cell: dispose of the SQLAlchemy engine created in the setup cell\n",
+    "engine.dispose()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.10 ('plotter')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "27c6d93b683c7a1975bfd893e997da1d087883bf6b96d34d1e63ecc137ac54d0"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}