diff --git a/docs/tutorials/binning_slowly_changing_sources.ipynb b/docs/tutorials/binning_slowly_changing_sources.ipynb
index 767b34c8..c55da0ff 100644
--- a/docs/tutorials/binning_slowly_changing_sources.ipynb
+++ b/docs/tutorials/binning_slowly_changing_sources.ipynb
@@ -263,15 +263,6 @@
     "ax.set_xlabel(\"Time (MJD)\")\n",
     "ax.set_ylabel(\"Source Count\")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ens.client.close() # Tear down the ensemble client"
-   ]
   }
  ],
  "metadata": {
@@ -290,7 +281,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/tutorials/scaling_to_large_data.ipynb b/docs/tutorials/scaling_to_large_data.ipynb
index 6624b7e2..09fbd99a 100644
--- a/docs/tutorials/scaling_to_large_data.ipynb
+++ b/docs/tutorials/scaling_to_large_data.ipynb
@@ -28,9 +28,7 @@
   "source": [
    "### The `Dask` Client and Scheduler\n",
    "\n",
-   "An important aspect of `Dask` to understand for optimizing it's performance for large datasets is the Distributed Client. `Dask` has [thorough documentation](https://distributed.dask.org/en/stable/client.html) on this, but the general idea is that the Distributed Client is the entrypoint for setting up a distributed system. The Distributed Client enables asynchronous computation, where Dask's `compute` and `persist` methods are able to run in the background and persist in memory while we continue doing other work.\n",
-   "\n",
-   "In the TAPE `Ensemble`, by default a Distributed Client is spun up in the background, which can be accessed using `Ensemble.client_info()`:\n",
+   "An important aspect of `Dask` to understand for optimizing its performance for large datasets is the Distributed Client. `Dask` has [thorough documentation](https://distributed.dask.org/en/stable/client.html) on this, but the general idea is that the Distributed Client is the entrypoint for setting up a distributed system. By default, the TAPE `Ensemble` operates without a Distributed Client; to spin one up in the background, pass `client=True` when initializing the `Ensemble`. The client can then be inspected with `Ensemble.client_info()`:\n",
    "\n"
   ]
  },
@@ -42,7 +40,7 @@
   "source": [
    "from tape import Ensemble\n",
    "\n",
-   "ens = Ensemble()\n",
+   "ens = Ensemble(client=True)\n",
    "\n",
    "ens.client_info()"
   ]
@@ -72,7 +70,7 @@
   "metadata": {},
  "outputs": [],
  "source": [
-   "ens = Ensemble(n_workers=3, threads_per_worker=2)\n",
+   "ens = Ensemble(client=True, n_workers=3, threads_per_worker=2)\n",
    "\n",
    "ens.client_info()"
   ]
@@ -141,23 +139,6 @@
    "This may be preferable for those who want full control of the `Dask` client API, which may be beneficial when working on external machines/services or when a more complex setup is desired."
   ]
  },
- {
-  "attachments": {},
-  "cell_type": "markdown",
-  "metadata": {},
-  "source": [
-   "Alternatively, there may be instances where you would prefer to not use the Distributed Client, particularly when working with smaller amounts of data. In these instances, we allow users to disable the creation of a Distributed Client by passing `client=False`, as follows:"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "ens=Ensemble(client=False)"
-  ]
- },
 {
  "attachments": {},
  "cell_type": "markdown",
  "metadata": {},
@@ -181,13 +162,15 @@
    "ens = Ensemble(client=client)\n",
    "\n",
    "# Read in data from a parquet file\n",
-   "ens.from_parquet(\"../../tests/tape_tests/data/source/test_source.parquet\",\n",
-   "                 id_col='ps1_objid',\n",
-   "                 time_col='midPointTai',\n",
-   "                 flux_col='psFlux',\n",
-   "                 err_col='psFluxErr',\n",
-   "                 band_col='filterName',\n",
-   "                 partition_size='5KB')\n",
+   "ens.from_parquet(\n",
+   "    \"../../tests/tape_tests/data/source/test_source.parquet\",\n",
+   "    id_col=\"ps1_objid\",\n",
+   "    time_col=\"midPointTai\",\n",
+   "    flux_col=\"psFlux\",\n",
+   "    err_col=\"psFluxErr\",\n",
+   "    band_col=\"filterName\",\n",
+   "    partition_size=\"5KB\",\n",
+   ")\n",
    "\n",
    "ens.info()"
   ]
@@ -211,8 +194,12 @@
    "from tape.analysis.stetsonj import calc_stetson_J\n",
    "import numpy as np\n",
    "\n",
-   "mapres = ens.batch(calc_stetson_J, use_map=True) # will not know to look at multiple partitions to get lightcurve data\n",
-   "groupres = ens.batch(calc_stetson_J, use_map=False) # will know to look at multiple partitions, with shuffling costs\n",
+   "mapres = ens.batch(\n",
+   "    calc_stetson_J, use_map=True\n",
+   ")  # will not know to look at multiple partitions to get lightcurve data\n",
+   "groupres = ens.batch(\n",
+   "    calc_stetson_J, use_map=False\n",
+   ")  # will know to look at multiple partitions, with shuffling costs\n",
    "\n",
    "print(\"number of lightcurve results in mapres: \", len(mapres))\n",
    "print(\"number of lightcurve results in groupres: \", len(groupres))\n",
@@ -249,7 +236,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "py310",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -263,11 +250,11 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.11"
   },
   "vscode": {
    "interpreter": {
-    "hash": "08968836a6367873274ed1d5e98a07391f42fc3a62bd5aba54afbd7b11ba8673"
+    "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec"
    }
   }
  },
diff --git a/docs/tutorials/structure_function_showcase.ipynb b/docs/tutorials/structure_function_showcase.ipynb
index f2168f23..d7aafb12 100644
--- a/docs/tutorials/structure_function_showcase.ipynb
+++ b/docs/tutorials/structure_function_showcase.ipynb
@@ -21,7 +21,7 @@
    "import eztao\n",
    "\n",
    "import numpy as np\n",
-   "import matplotlib.pyplot as plt\n"
+   "import matplotlib.pyplot as plt"
   ]
  },
@@ -76,19 +76,16 @@
    "# Generate `num_light_curves` lightcurves\n",
    "# t, y, yerr are np.ndarray with shape = (num_light_curves, num_observations)\n",
    "t, y, err = gpSimRand(\n",
-   "    carmaTerm=DRW_kernel,\n",
-   "    SNR=snr,\n",
-   "    duration=duration_in_days,\n",
-   "    N=num_observations,\n",
-   "    nLC=num_light_curves)\n",
+   "    carmaTerm=DRW_kernel, SNR=snr, duration=duration_in_days, N=num_observations, nLC=num_light_curves\n",
+   ")\n",
    "\n",
    "# pick 10 lightcurves at random and plot (with small offset for clarity)\n",
    "sel_10 = np.random.choice(len(t), size=10, replace=False)\n",
-   "plt.figure(dpi=150, figsize=(10,5))\n",
+   "plt.figure(dpi=150, figsize=(10, 5))\n",
    "for i, j in enumerate(sel_10):\n",
-   "    plt.errorbar(t[j], y[j]+i*0.2, yerr=err[j], ls='', marker='.', ms=4, capsize=2)\n",
-   "plt.xlabel('time [unit]')\n",
-   "plt.ylabel('magnitude [unit]')"
+   "    plt.errorbar(t[j], y[j] + i * 0.2, yerr=err[j], ls=\"\", marker=\".\", ms=4, capsize=2)\n",
+   "plt.xlabel(\"time [unit]\")\n",
+   "plt.ylabel(\"magnitude [unit]\")"
   ]
  },
@@ -121,13 +118,17 @@
    "plt.figure()\n",
    "for i, j in enumerate(sel_10):\n",
    "    res_basic = tape.analysis.calc_sf2(t[j], y[j], err[j])\n",
-   "    plt.plot(res_basic['dt'], res_basic['sf2'], marker='.',)\n",
-   "\n",
-   "plt.yscale('log')\n",
-   "plt.xscale('log')\n",
+   "    plt.plot(\n",
+   "        res_basic[\"dt\"],\n",
+   "        res_basic[\"sf2\"],\n",
+   "        marker=\".\",\n",
+   "    )\n",
+   "\n",
+   "plt.yscale(\"log\")\n",
+   "plt.xscale(\"log\")\n",
    "plt.title(\"Structure Function^2\")\n",
-   "plt.xlabel('Time [time unit]')\n",
-   "plt.ylabel('SF^2 [magnitude unit^2]')\n",
+   "plt.xlabel(\"Time [time unit]\")\n",
+   "plt.ylabel(\"SF^2 [magnitude unit^2]\")\n",
    "plt.show()"
   ]
  },
@@ -170,10 +171,13 @@
  "metadata": {},
  "outputs": [],
  "source": [
-   "id_ens, t_ens, y_ens, yerr_ens, filter_ens\\\n",
-   "    = np.array([]), np.array([]), np.array([]), np.array([]), np.array([])\n",
-   "\n",
-   "\n",
+   "id_ens, t_ens, y_ens, yerr_ens, filter_ens = (\n",
+   "    np.array([]),\n",
+   "    np.array([]),\n",
+   "    np.array([]),\n",
+   "    np.array([]),\n",
+   "    np.array([]),\n",
+   ")\n",
    "\n",
    "\n",
    "for j in range(num_light_curves):\n",
@@ -181,7 +185,7 @@
    "    id_ens = np.append(id_ens, np.full(num_observations, j))\n",
    "    t_ens = np.append(t_ens, t[j])\n",
    "    y_ens = np.append(y_ens, y[j])\n",
    "    yerr_ens = np.append(yerr_ens, err[j])\n",
-   "    filter_ens = np.append(filter_ens, np.full(num_observations, 'r'))\n"
+   "    filter_ens = np.append(filter_ens, np.full(num_observations, \"r\"))"
   ]
  },
@@ -190,15 +194,14 @@
  "metadata": {},
  "outputs": [],
  "source": [
-   "id_ens, t_ens, y_ens, yerr_ens, filter_ens\\\n",
-   "    = [], [], [], [], []\n",
+   "id_ens, t_ens, y_ens, yerr_ens, filter_ens = [], [], [], [], []\n",
    "\n",
    "for j in range(num_light_curves):\n",
    "    id_ens.append(np.full(num_observations, j))\n",
    "    t_ens.append(t[j])\n",
    "    y_ens.append(y[j])\n",
-   "    yerr_ens.append( err[j])\n",
-   "    filter_ens.append(np.full(num_observations, 'r'))\n",
+   "    yerr_ens.append(err[j])\n",
+   "    filter_ens.append(np.full(num_observations, \"r\"))\n",
    "\n",
    "id_ens = np.array(id_ens)\n",
    "t_ens = np.array(t_ens)\n",
@@ -214,19 +217,18 @@
  "outputs": [],
  "source": [
    "# First, we create all the columns that we will want to fill\n",
-   "# In addition to time, measurement and errors, this includes \n",
+   "# In addition to time, measurement and errors, this includes\n",
    "# id for each lightcurve and information about the filter used\n",
    "# For the purpose of this tutorial let's assume we are observing in `r` filter\n",
    "\n",
-   "id_ens, t_ens, y_ens, yerr_ens, filter_ens\\\n",
-   "    = [], [], [], [], []\n",
+   "id_ens, t_ens, y_ens, yerr_ens, filter_ens = [], [], [], [], []\n",
    "\n",
    "for j in range(num_light_curves):\n",
    "    id_ens.append(np.full(num_observations, j))\n",
    "    t_ens.append(t[j])\n",
    "    y_ens.append(y[j])\n",
-   "    yerr_ens.append( err[j])\n",
-   "    filter_ens.append(np.full(num_observations, 'r'))\n",
+   "    yerr_ens.append(err[j])\n",
+   "    filter_ens.append(np.full(num_observations, \"r\"))\n",
    "\n",
    "id_ens = np.concatenate(id_ens)\n",
    "t_ens = np.concatenate(t_ens)\n",
@@ -234,20 +236,20 @@
    "yerr_ens = np.concatenate(yerr_ens)\n",
    "filter_ens = np.concatenate(filter_ens)\n",
    "\n",
-   "# This line makes sure that ids are integers \n",
+   "# This line makes sure that ids are integers\n",
    "id_ens = id_ens.astype(int)\n",
    "\n",
    "# Create ColumnMapper object that will tell ensemble\n",
    "# meaning of each column\n",
    "manual_colmap = ColumnMapper().assign(\n",
-   "    id_col=\"id_ens\", time_col=\"t_ens\", flux_col=\"y_ens\",\n",
-   "    err_col=\"yerr_ens\", band_col=\"filter_ens\"\n",
+   "    id_col=\"id_ens\", time_col=\"t_ens\", flux_col=\"y_ens\", err_col=\"yerr_ens\", band_col=\"filter_ens\"\n",
    ")\n",
    "\n",
    "ens = Ensemble()\n",
-   "ens.from_source_dict({'id_ens': id_ens, \"t_ens\": t_ens, 'y_ens': y_ens,\n",
-   "                      'yerr_ens': yerr_ens, 'filter_ens': filter_ens},\n",
-   "                     column_mapper=manual_colmap)"
+   "ens.from_source_dict(\n",
+   "    {\"id_ens\": id_ens, \"t_ens\": t_ens, \"y_ens\": y_ens, \"yerr_ens\": yerr_ens, \"filter_ens\": filter_ens},\n",
+   "    column_mapper=manual_colmap,\n",
+   ")"
   ]
  },
@@ -267,7 +269,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
-   "ens.object.head(5) \n"
+   "ens.object.head(5)"
   ]
  },
@@ -276,7 +278,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
-   "ens.source.head(5) "
+   "ens.source.head(5)"
   ]
  },
@@ -308,18 +310,18 @@
    "arg_container.bin_count_target = 100000\n",
    "arg_container.estimate_err = True\n",
    "arg_container.calculation_repetitions = 100\n",
-   "res_sf2 = ens.sf2(argument_container = arg_container)\n",
+   "res_sf2 = ens.sf2(argument_container=arg_container)\n",
    "\n",
    "# We can also operate on a single lightcurve, without going via ensemble,\n",
    "# but with full amount of options going via argument container\n",
-   "j = 7 # random lightcurve \n",
-   "arg_container.combine = False # do not combine - only one lightcurve\n",
+   "j = 7  # random lightcurve\n",
+   "arg_container.combine = False  # do not combine - only one lightcurve\n",
    "# only one lightcurve instead of 100 - specify 100 smaller count to get similar time-bin size as in\n",
    "# the full ensemble case\n",
-   "arg_container.bin_count_target = 1000 \n",
+   "arg_container.bin_count_target = 1000\n",
    "\n",
    "\n",
-   "res_one = tape.analysis.calc_sf2(t[j], y[j], err[j], argument_container = arg_container)\n",
+   "res_one = tape.analysis.calc_sf2(t[j], y[j], err[j], argument_container=arg_container)\n",
    "\n",
    "# create error array here\n",
    "error_array = np.zeros(len(y[j]))"
   ]
  },
@@ -332,8 +334,8 @@
  "outputs": [],
  "source": [
    "# To compare with empirical results we create theoretical structure function of the underlying model\n",
-   "t_th = np.arange(0,10000,1)\n",
-   "SF_th = (amp**2 * 2) * (1-np.exp(-t_th/tau))"
+   "t_th = np.arange(0, 10000, 1)\n",
+   "SF_th = (amp**2 * 2) * (1 - np.exp(-t_th / tau))"
   ]
  },
@@ -361,19 +363,32 @@
  "outputs": [],
  "source": [
    "plt.figure()\n",
-   "plt.errorbar(res_sf2['dt'], res_sf2['sf2'], yerr=res_sf2['1_sigma'],\n",
-   "             marker='.', capsize=2, label='ensemble result', color='red')\n",
-   "plt.errorbar(res_one['dt'], res_one['sf2'], yerr=res_one['1_sigma'],\n",
-   "             marker='.', capsize=2, label='single lightcurve result', color='orange')\n",
-   "plt.plot(t_th, SF_th, marker='', label='theoretical result', \n",
-   "         color='black', lw=2)\n",
-   "plt.yscale('log')\n",
-   "plt.xscale('log')\n",
+   "plt.errorbar(\n",
+   "    res_sf2[\"dt\"],\n",
+   "    res_sf2[\"sf2\"],\n",
+   "    yerr=res_sf2[\"1_sigma\"],\n",
+   "    marker=\".\",\n",
+   "    capsize=2,\n",
+   "    label=\"ensemble result\",\n",
+   "    color=\"red\",\n",
+   ")\n",
+   "plt.errorbar(\n",
+   "    res_one[\"dt\"],\n",
+   "    res_one[\"sf2\"],\n",
+   "    yerr=res_one[\"1_sigma\"],\n",
+   "    marker=\".\",\n",
+   "    capsize=2,\n",
+   "    label=\"single lightcurve result\",\n",
+   "    color=\"orange\",\n",
+   ")\n",
+   "plt.plot(t_th, SF_th, marker=\"\", label=\"theoretical result\", color=\"black\", lw=2)\n",
+   "plt.yscale(\"log\")\n",
+   "plt.xscale(\"log\")\n",
    "plt.title(\"Ensemble and single structure function\")\n",
-   "plt.xlabel('Time [time unit]')\n",
-   "plt.ylabel('SF^2 [magnitude unit^2]')\n",
-   "plt.ylim(10**(-2), 10**(-0.4))\n",
-   "plt.xlim(10**(1), 10**(3.8))\n",
+   "plt.xlabel(\"Time [time unit]\")\n",
+   "plt.ylabel(\"SF^2 [magnitude unit^2]\")\n",
+   "plt.ylim(10 ** (-2), 10 ** (-0.4))\n",
+   "plt.xlim(10 ** (1), 10 ** (3.8))\n",
    "plt.legend()\n",
    "plt.show()"
   ]
@@ -395,12 +410,13 @@
  "source": [
    "# For the same lightcurve as above, let us resample it 100 times and put the results in separate array\n",
    "arg_container.estimate_err = False\n",
-   "res_resample_arr = np.zeros((100,20))\n",
-   "for i in range(100): \n",
-   "    resample_arr = np.sort(np.random.choice(np.arange(0,200), size=len(y[j]), replace=True))\n",
-   "    res_resample = tape.analysis.calc_sf2(t[j][resample_arr], y[j][resample_arr], err[j][resample_arr],\n",
-   "                                          argument_container=arg_container)\n",
-   "    res_resample_arr[i]=res_resample['sf2'].values"
+   "res_resample_arr = np.zeros((100, 20))\n",
+   "for i in range(100):\n",
+   "    resample_arr = np.sort(np.random.choice(np.arange(0, 200), size=len(y[j]), replace=True))\n",
+   "    res_resample = tape.analysis.calc_sf2(\n",
+   "        t[j][resample_arr], y[j][resample_arr], err[j][resample_arr], argument_container=arg_container\n",
+   "    )\n",
+   "    res_resample_arr[i] = res_resample[\"sf2\"].values"
   ]
  },
@@ -410,18 +426,24 @@
  "outputs": [],
  "source": [
    "# Show all of the 100 results in faint yellow\n",
-   "plt.plot(res_one['dt'], res_resample_arr.transpose(), alpha=0.3, color='yellow')\n",
-   "plt.errorbar(res_one['dt'], res_one['sf2'], yerr=res_one['1_sigma'],\n",
-   "             marker='.', capsize=2, label='single lightcurve result', color='orange')\n",
-   "plt.plot(t_th, SF_th, marker='', label='theoretical result', \n",
-   "         color='black', lw=2 )\n",
-   "plt.yscale('log')\n",
-   "plt.xscale('log')\n",
+   "plt.plot(res_one[\"dt\"], res_resample_arr.transpose(), alpha=0.3, color=\"yellow\")\n",
+   "plt.errorbar(\n",
+   "    res_one[\"dt\"],\n",
+   "    res_one[\"sf2\"],\n",
+   "    yerr=res_one[\"1_sigma\"],\n",
+   "    marker=\".\",\n",
+   "    capsize=2,\n",
+   "    label=\"single lightcurve result\",\n",
+   "    color=\"orange\",\n",
+   ")\n",
+   "plt.plot(t_th, SF_th, marker=\"\", label=\"theoretical result\", color=\"black\", lw=2)\n",
+   "plt.yscale(\"log\")\n",
+   "plt.xscale(\"log\")\n",
    "plt.title(\"Ensemble and single structure function\")\n",
-   "plt.xlabel('Time [time unit]')\n",
-   "plt.ylabel('SF^2 [magnitude unit^2]')\n",
-   "plt.ylim(10**(-2), 10**(-0.4))\n",
-   "plt.xlim(10**(1), 10**(3.8))\n",
+   "plt.xlabel(\"Time [time unit]\")\n",
+   "plt.ylabel(\"SF^2 [magnitude unit^2]\")\n",
+   "plt.ylim(10 ** (-2), 10 ** (-0.4))\n",
+   "plt.xlim(10 ** (1), 10 ** (3.8))\n",
    "plt.legend()\n",
    "plt.show()"
   ]
@@ -445,15 +467,14 @@
  "outputs": [],
  "source": [
    "plt.hist(res_resample_arr.transpose()[0])\n",
-   "err_manual = (np.quantile(res_resample_arr.transpose()[0], 0.84) -\n",
-   "              np.quantile(res_resample_arr.transpose()[0], 0.16)) /2 \n",
-   "plt.axvline(res_one['sf2'][0], color='orange', label='SF value')\n",
-   "plt.axvline(res_one['sf2'][0]+res_one['1_sigma'][0], color='orange', ls='--',\n",
-   "            label='reported error')\n",
-   "plt.axvline(res_one['sf2'][0]-res_one['1_sigma'][0], color='orange', ls='--')\n",
-   "plt.axvline(res_one['sf2'][0]+err_manual, color='orange', ls=':',\n",
-   "            label='manually computed error')\n",
-   "plt.axvline(res_one['sf2'][0]-err_manual, color='orange', ls=':')\n",
+   "err_manual = (\n",
+   "    np.quantile(res_resample_arr.transpose()[0], 0.84) - np.quantile(res_resample_arr.transpose()[0], 0.16)\n",
+   ") / 2\n",
+   "plt.axvline(res_one[\"sf2\"][0], color=\"orange\", label=\"SF value\")\n",
+   "plt.axvline(res_one[\"sf2\"][0] + res_one[\"1_sigma\"][0], color=\"orange\", ls=\"--\", label=\"reported error\")\n",
+   "plt.axvline(res_one[\"sf2\"][0] - res_one[\"1_sigma\"][0], color=\"orange\", ls=\"--\")\n",
+   "plt.axvline(res_one[\"sf2\"][0] + err_manual, color=\"orange\", ls=\":\", label=\"manually computed error\")\n",
+   "plt.axvline(res_one[\"sf2\"][0] - err_manual, color=\"orange\", ls=\":\")\n",
    "plt.legend()"
   ]
  },
@@ -483,25 +504,25 @@
  "metadata": {},
  "outputs": [],
  "source": [
-   "# This takes at the moment 3 minutes, 25 seconds \n",
+   "# At the time of writing, this takes about 3 minutes, 25 seconds\n",
    "arg_container = tape.analysis.structure_function.StructureFunctionArgumentContainer()\n",
    "\n",
    "\n",
-   "arg_container.sf_method = 'macleod_2012'\n",
+   "arg_container.sf_method = \"macleod_2012\"\n",
    "arg_container.combine = True\n",
    "arg_container.bin_count_target = 100000\n",
    "arg_container.calculation_repetitions = 1\n",
-   "res_macleod = ens.sf2( argument_container=arg_container)\n",
+   "res_macleod = ens.sf2(argument_container=arg_container)\n",
    "\n",
    "\n",
-   "arg_container.sf_method = 'bauer_2009a'\n",
+   "arg_container.sf_method = \"bauer_2009a\"\n",
    "res_bauer_a = ens.sf2(argument_container=arg_container)\n",
    "\n",
-   "arg_container.sf_method = 'bauer_2009b'\n",
+   "arg_container.sf_method = \"bauer_2009b\"\n",
    "res_bauer_b = ens.sf2(argument_container=arg_container)\n",
    "\n",
-   "arg_container.sf_method = 'schmidt_2010'\n",
-   "res_schmidt = ens.sf2(argument_container=arg_container)\n"
+   "arg_container.sf_method = \"schmidt_2010\"\n",
+   "res_schmidt = ens.sf2(argument_container=arg_container)"
   ]
  },
@@ -511,42 +532,20 @@
  "outputs": [],
  "source": [
    "# plt.plot(res_basic['dt'], res_basic['sf2'], 'b', label='Basic', lw = 3, marker = 'o')\n",
-   "plt.plot(res_macleod['dt'], res_macleod['sf2'], 'g',marker='.', label='MacLeod 2012')\n",
-   "plt.plot(res_bauer_a['dt'], res_bauer_a['sf2'], 'magenta',marker='.', label='Bauer 2009a')\n",
-   "plt.plot(res_bauer_b['dt'], res_bauer_b['sf2'], 'brown',marker='.', label='Bauer 2009b')\n",
-   "plt.plot(res_schmidt['dt'], res_schmidt['sf2'], 'm',marker='.', label='Schmidt 2010')\n",
-   "plt.plot(t_th, SF_th, marker='', label='theoretical result', \n",
-   "         color='black', lw=2 )\n",
+   "plt.plot(res_macleod[\"dt\"], res_macleod[\"sf2\"], \"g\", marker=\".\", label=\"MacLeod 2012\")\n",
+   "plt.plot(res_bauer_a[\"dt\"], res_bauer_a[\"sf2\"], \"magenta\", marker=\".\", label=\"Bauer 2009a\")\n",
+   "plt.plot(res_bauer_b[\"dt\"], res_bauer_b[\"sf2\"], \"brown\", marker=\".\", label=\"Bauer 2009b\")\n",
+   "plt.plot(res_schmidt[\"dt\"], res_schmidt[\"sf2\"], \"purple\", marker=\".\", label=\"Schmidt 2010\")\n",
+   "plt.plot(t_th, SF_th, marker=\"\", label=\"theoretical result\", color=\"black\", lw=2)\n",
    "plt.legend()\n",
-   "plt.yscale('log')\n",
-   "plt.xscale('log')\n",
-   "plt.xlabel('Time [time unit]')\n",
-   "plt.ylabel('SF^2 [magnitude unit^2]')\n",
-   "plt.ylim(10**(-2), 10**(-0.4))\n",
-   "plt.xlim(10**(1), 10**(3.8))\n",
+   "plt.yscale(\"log\")\n",
+   "plt.xscale(\"log\")\n",
+   "plt.xlabel(\"Time [time unit]\")\n",
+   "plt.ylabel(\"SF^2 [magnitude unit^2]\")\n",
+   "plt.ylim(10 ** (-2), 10 ** (-0.4))\n",
+   "plt.xlim(10 ** (1), 10 ** (3.8))\n",
    "plt.legend()"
   ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": []
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": []
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
- "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/tutorials/tape_datasets.ipynb b/docs/tutorials/tape_datasets.ipynb index ddcec2de..0d1b397f 100644 --- a/docs/tutorials/tape_datasets.ipynb +++ b/docs/tutorials/tape_datasets.ipynb @@ -41,27 +41,14 @@ "ens = Ensemble() # initialize an ensemble object\n", "\n", "# A ColumnMapper is created to map columns of the parquet file to timeseries quantities, such as flux, time, etc.\n", - "col_map = ColumnMapper(id_col=\"ps1_objid\",\n", - " time_col=\"midPointTai\",\n", - " flux_col=\"psFlux\",\n", - " err_col=\"psFluxErr\",\n", - " band_col=\"filterName\")\n", + "col_map = ColumnMapper(\n", + " id_col=\"ps1_objid\", time_col=\"midPointTai\", flux_col=\"psFlux\", err_col=\"psFluxErr\", band_col=\"filterName\"\n", + ")\n", "\n", "# Read in data from a parquet file that contains source (timeseries) data\n", - "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\",\n", - " column_mapper=col_map\n", - " )\n", + "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\", column_mapper=col_map)\n", "\n", - "ens.source.head(5) # View the first 5 entries of the source table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ens.client.close() # Tear down the ensemble client" + "ens.source.head(5) # View the first 5 entries of the source table" ] }, { @@ -80,7 +67,8 @@ "source": [ "ens = Ensemble() # initialize an ensemble object\n", "\n", - "col_map = ColumnMapper(id_col=\"ps1_objid\",\n", + "col_map = ColumnMapper(\n", + " id_col=\"ps1_objid\",\n", " time_col=\"midPointTai\",\n", " flux_col=\"psFlux\",\n", " err_col=\"psFluxErr\",\n", @@ -88,21 +76,13 @@ ")\n", "\n", "# Read in data from a parquet file that contains source (timeseries) data\n", - "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\",\n", - " object_file=f\"{rel_path}/object/test_object.parquet\",\n", - " column_mapper=col_map\n", - " )\n", + "ens.from_parquet(\n", + " source_file=f\"{rel_path}/source/test_source.parquet\",\n", + " object_file=f\"{rel_path}/object/test_object.parquet\",\n", + " column_mapper=col_map,\n", + ")\n", "\n", - "ens.object.head(5) # View the first 5 entries of the object table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ens.client.close() # Tear down the ensemble client" + "ens.object.head(5) # View the first 5 entries of the object table" ] }, { @@ -120,6 +100,7 @@ "outputs": [], "source": [ "from pyarrow import parquet\n", + "\n", "parquet.read_schema(f\"{rel_path}/source/test_source.parquet\", memory_map=True)" ] }, @@ -171,15 +152,6 @@ "ens.object.head(5)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ens.client.close() # Tear down the ensemble client" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -199,23 +171,26 @@ "outputs": [], "source": [ "import numpy as np\n", + "\n", "np.random.seed(1)\n", "\n", "# initialize a dictionary of empty arrays\n", - "source_dict = {\"id\": np.array([]),\n", - " \"time\": np.array([]),\n", - " \"flux\": np.array([]),\n", - " \"error\": np.array([]),\n", - " \"band\": np.array([])}\n", + "source_dict = {\n", + " \"id\": np.array([]),\n", + " \"time\": np.array([]),\n", + " \"flux\": np.array([]),\n", + " \"error\": np.array([]),\n", + " \"band\": np.array([]),\n", + "}\n", "\n", "# Create 10 lightcurves with 100 measurements each\n", "lc_len = 
100\n", "for i in range(10):\n", - " source_dict[\"id\"] = np.append(source_dict[\"id\"], np.array([i]*lc_len)).astype(int)\n", + " source_dict[\"id\"] = np.append(source_dict[\"id\"], np.array([i] * lc_len)).astype(int)\n", " source_dict[\"time\"] = np.append(source_dict[\"time\"], np.linspace(1, lc_len, lc_len))\n", " source_dict[\"flux\"] = np.append(source_dict[\"flux\"], 100 + 50 * np.random.rand(lc_len))\n", " source_dict[\"error\"] = np.append(source_dict[\"error\"], 10 + 5 * np.random.rand(lc_len))\n", - " source_dict[\"band\"] = np.append(source_dict[\"band\"], [\"g\"]*50+[\"r\"]*50)" + " source_dict[\"band\"] = np.append(source_dict[\"band\"], [\"g\"] * 50 + [\"r\"] * 50)" ] }, { @@ -232,25 +207,11 @@ "metadata": {}, "outputs": [], "source": [ - "colmap = ColumnMapper(id_col=\"id\",\n", - " time_col=\"time\",\n", - " flux_col=\"flux\",\n", - " err_col=\"error\",\n", - " band_col=\"band\")\n", + "colmap = ColumnMapper(id_col=\"id\", time_col=\"time\", flux_col=\"flux\", err_col=\"error\", band_col=\"band\")\n", "ens = Ensemble()\n", "ens.from_source_dict(source_dict, column_mapper=colmap)\n", "\n", - "ens.info()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ens.client.close() # Tear down the ensemble client" + "ens.info()" ] } ], @@ -270,7 +231,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.11" }, "vscode": { "interpreter": { diff --git a/docs/tutorials/working_with_structure_function.ipynb b/docs/tutorials/working_with_structure_function.ipynb index 0e8602d0..36c8411f 100644 --- a/docs/tutorials/working_with_structure_function.ipynb +++ b/docs/tutorials/working_with_structure_function.ipynb @@ -55,9 +55,10 @@ "outputs": [], "source": [ "from tape.analysis.structure_function.calculator_registrar import SF_METHODS\n", + "\n", "print(SF_METHODS.keys())\n", "\n", - "help(SF_METHODS['bauer_2009b'])" + "help(SF_METHODS[\"bauer_2009b\"])" ] }, { @@ -238,12 +239,14 @@ "ens = Ensemble() # initialize an ensemble object\n", "\n", "# Read in data from a parquet file\n", - "ens.from_parquet(\"../../tests/tape_tests/data/source/test_source.parquet\",\n", - " id_col='ps1_objid',\n", - " time_col='midPointTai',\n", - " flux_col='psFlux',\n", - " err_col='psFluxErr',\n", - " band_col='filterName')" + "ens.from_parquet(\n", + " \"../../tests/tape_tests/data/source/test_source.parquet\",\n", + " id_col=\"ps1_objid\",\n", + " time_col=\"midPointTai\",\n", + " flux_col=\"psFlux\",\n", + " err_col=\"psFluxErr\",\n", + " band_col=\"filterName\",\n", + ")" ] }, { @@ -275,15 +278,6 @@ "res" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ens.client.close()" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -302,7 +296,7 @@ "outputs": [], "source": [ "# Pull out a TimeSeries object from the previous Ensemble\n", - "ts = ens.to_timeseries(88472935274829959) # provided a target object id\n", + "ts = ens.to_timeseries(88472935274829959) # provided a target object id\n", "\n", "# Calculate the Structure Function for this lightcurve.\n", "res = ts.sf2()\n", @@ -358,6 +352,7 @@ "# Creating a new subclass of `StructureFunctionCalculator` to implement a new SF calculator method\n", "from tape.analysis.structure_function.base_calculator import StructureFunctionCalculator\n", "\n", + "\n", "class ExperimentalStructureFunctionCalculator(StructureFunctionCalculator):\n", " def 
calculate(self):\n", " # self._calculate_binned_statistic is provided by the parent class,\n", @@ -437,6 +432,7 @@ "source": [ "from dataclasses import dataclass\n", "\n", + "\n", "@dataclass\n", "class HitchhikerStructureFunctionArgumentContainer(StructureFunctionArgumentContainer):\n", " # note, due to the way dataclass inheritance works, all parameters must have a default value!\n", @@ -459,6 +455,7 @@ "source": [ "from tape.analysis.structure_function.base_calculator import StructureFunctionCalculator\n", "\n", + "\n", "class HitchhikerStructureFunctionCalculator(StructureFunctionCalculator):\n", " def calculate(self):\n", " # self._calculate_binned_statistic is provided by the parent class,\n", @@ -482,6 +479,7 @@ " def expected_argument_container() -> type:\n", " return HitchhikerStructureFunctionArgumentContainer\n", "\n", + "\n", "# register the new subclass and make sure we see \"hitchhiker\" in our list\n", "update_sf_subclasses()\n", "print(SF_METHODS.keys())" @@ -613,7 +611,7 @@ "# Notes:\n", "# - Only pair-wise differences where time_1 - time_2 > 0 are retained.'\n", "# - These are also considered 'protected' variables.\n", - "expected_number_of_differences = len(sf_lightcurve._times) * (len(sf_lightcurve._times)-1) / 2\n", + "expected_number_of_differences = len(sf_lightcurve._times) * (len(sf_lightcurve._times) - 1) / 2\n", "print(f\"\\nExpected number of differences (N * N-1)/2: {expected_number_of_differences}\")\n", "print(f\"Number of differences found: {len(sf_lightcurve._all_d_times)}\\n\")\n", "print(f\"All time differences:\\n{sf_lightcurve._all_d_times}\")\n", @@ -672,7 +670,7 @@ "print(\"\\nWe expect 12 different values now\")\n", "print(f\"All time differences:\\n{sf_lightcurve.sample_d_times}\")\n", "print(f\"All flux differences:\\n{sf_lightcurve.sample_d_fluxes}\")\n", - "print(f\"All summed errors:\\n{sf_lightcurve.sample_sum_squared_error}\")\n" + "print(f\"All summed errors:\\n{sf_lightcurve.sample_sum_squared_error}\")" ] }, { @@ -706,7 +704,7 @@ " # and perform a simple operation on them, similar to what occurs in\n", " # `StructureFunctionLightCurve._calculate_differences`, except that we\n", " # do not square the error values.\n", - " values_to_be_binned = [self.error_sum(lc._times,lc._errors) for lc in self._lightcurves]\n", + " values_to_be_binned = [self.error_sum(lc._times, lc._errors) for lc in self._lightcurves]\n", " _, mean_err_per_bin = self._calculate_binned_statistics(sample_values=values_to_be_binned)\n", "\n", " # perform some calculation using the time-binned data\n", @@ -730,6 +728,7 @@ " def expected_argument_container() -> type:\n", " return StructureFunctionArgumentContainer\n", "\n", + "\n", "# register the new subclass\n", "update_sf_subclasses()" ] @@ -766,7 +765,7 @@ ], "metadata": { "kernelspec": { - "display_name": "py310", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -780,11 +779,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.11" }, "vscode": { "interpreter": { - "hash": "08968836a6367873274ed1d5e98a07391f42fc3a62bd5aba54afbd7b11ba8673" + "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec" } } }, diff --git a/docs/tutorials/working_with_the_ensemble.ipynb b/docs/tutorials/working_with_the_ensemble.ipynb index a7863764..46f52308 100644 --- a/docs/tutorials/working_with_the_ensemble.ipynb +++ b/docs/tutorials/working_with_the_ensemble.ipynb @@ -920,24 +920,10 @@ "metadata": {}, "outputs": [], 
"source": [ - "new_ens = Ensemble(client=ens.client) # use the same client\n", + "new_ens = Ensemble()\n", "new_ens.from_ensemble(\"./ensemble\", additional_frames=True)\n", "new_ens.select_frame(\"result_3\").head(5)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2023-08-30T14:58:37.764841Z", - "start_time": "2023-08-30T14:58:37.539014Z" - } - }, - "outputs": [], - "source": [ - "ens.client.close() # Tear down the ensemble client" - ] } ], "metadata": { diff --git a/docs/tutorials/working_with_the_timeseries.ipynb b/docs/tutorials/working_with_the_timeseries.ipynb index 07d1eb6e..09c5f2b0 100644 --- a/docs/tutorials/working_with_the_timeseries.ipynb +++ b/docs/tutorials/working_with_the_timeseries.ipynb @@ -33,12 +33,14 @@ "ens = Ensemble() # initialize an ensemble object\n", "\n", "# Read in data from a parquet file\n", - "ens.from_parquet(\"../../tests/tape_tests/data/source/test_source.parquet\",\n", - " id_col='ps1_objid',\n", - " time_col='midPointTai',\n", - " flux_col='psFlux',\n", - " err_col='psFluxErr',\n", - " band_col='filterName')\n" + "ens.from_parquet(\n", + " \"../../tests/tape_tests/data/source/test_source.parquet\",\n", + " id_col=\"ps1_objid\",\n", + " time_col=\"midPointTai\",\n", + " flux_col=\"psFlux\",\n", + " err_col=\"psFluxErr\",\n", + " band_col=\"filterName\",\n", + ")" ] }, { @@ -47,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "ts = ens.to_timeseries(88472935274829959) # provided a target object id\n", + "ts = ens.to_timeseries(88472935274829959) # provided a target object id\n", "ts.data" ] }, @@ -67,18 +69,9 @@ "source": [ "import matplotlib.pyplot as plt\n", "\n", - "ts_r = ts.data[ts.band=='r']\n", + "ts_r = ts.data[ts.band == \"r\"]\n", "\n", - "plt.plot(ts_r['midPointTai'], ts_r['psFlux'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ens.client.close()" + "plt.plot(ts_r[\"midPointTai\"], ts_r[\"psFlux\"])" ] } ], diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index 848ab4cb..047fd6b7 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -41,7 +41,7 @@ class Ensemble: """Ensemble object is a collection of light curve ids""" - def __init__(self, client=True, **kwargs): + def __init__(self, client=False, **kwargs): """Constructor of an Ensemble instance. Parameters @@ -50,7 +50,7 @@ def __init__(self, client=True, **kwargs): Accepts an existing `dask.distributed.Client`, or creates one if `client=True`, passing any additional kwargs to a dask.distributed.Client constructor call. If `client=False`, the - Ensemble is created without a distributed client. + Ensemble is created without a distributed client (default). """ self.result = None # holds the latest query