From 2befe87c0820a55c388a6955aa2f4688de065182 Mon Sep 17 00:00:00 2001 From: Lilian Date: Thu, 8 Jun 2023 17:43:00 +0200 Subject: [PATCH 1/3] Add `overload_job_titles` parameter to `fetch_employee_salaries` --- skrub/datasets/_fetching.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/skrub/datasets/_fetching.py b/skrub/datasets/_fetching.py index d2d96f0cd..1659baf0b 100644 --- a/skrub/datasets/_fetching.py +++ b/skrub/datasets/_fetching.py @@ -638,6 +638,7 @@ def fetch_employee_salaries( load_dataframe: bool = True, drop_linked: bool = True, drop_irrelevant: bool = True, + overload_job_titles: bool = True, directory: Optional[Union[Path, str]] = None, ) -> Union[DatasetAll, DatasetInfoOnly]: """Fetches the employee salaries dataset (regression), available at https://openml.org/d/42125 @@ -657,6 +658,11 @@ def fetch_employee_salaries( Drops column "full_name", which is usually irrelevant to the statistical analysis. + overload_job_titles : bool, default=True + Uses the column `underfilled_job_title` to enrich the + `employee_position_title` column, as it contains more detailed + information about the job title. 
+ Returns ------- :obj:`DatasetAll` @@ -685,6 +691,13 @@ def fetch_employee_salaries( ) if drop_irrelevant: dataset.X.drop(["full_name"], axis=1, inplace=True) + if overload_job_titles: + dataset.X["employee_position_title"] = dataset.X[ + "underfilled_job_title" + ].fillna(dataset.X["employee_position_title"]) + dataset.X.drop( + labels=["underfilled_job_title"], axis="columns", inplace=True + ) return dataset From 9c99531f5230a9aacb6bc2f02b80241fa5dfe3cb Mon Sep 17 00:00:00 2001 From: Lilian Date: Mon, 24 Jul 2023 14:22:06 +0200 Subject: [PATCH 2/3] Add changelog entry --- CHANGES.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 42db0d0ad..4c2da3902 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -10,7 +10,7 @@ Ongoing development ===================== Skrub has not been released yet. It is currently undergoing fast -development and backward compatability is not ensured. +development and backward compatibility is not ensured. Major changes ------------- @@ -102,6 +102,12 @@ Minor changes * :class:`TableVectorizer` doesn't fail anymore if an infered type doesn't work during transform. The new entries not matching the type are replaced by missing values. :pr:`666` by :user:`Leo Grinsztajn <LeoGrin>` +- Dataset fetcher :func:`fetch_employee_salaries` now has a parameter + `overload_job_titles` to allow overloading the job titles + (`employee_position_title`) with the column `underfilled_job_title`, + which provides some more information about the job title. 
+ :pr:`581` by :user:`Lilian Boulard <LilianBoulard>` + Before skrub: dirty_cat ======================== From a50294169bebcdd5a318a3a3f9b7475c2a74d65b Mon Sep 17 00:00:00 2001 From: Lilian Date: Mon, 24 Jul 2023 15:05:51 +0200 Subject: [PATCH 3/3] Fix path --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 4c2da3902..f80608051 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -102,7 +102,7 @@ Minor changes * :class:`TableVectorizer` doesn't fail anymore if an infered type doesn't work during transform. The new entries not matching the type are replaced by missing values. :pr:`666` by :user:`Leo Grinsztajn <LeoGrin>` -- Dataset fetcher :func:`fetch_employee_salaries` now has a parameter +- Dataset fetcher :func:`datasets.fetch_employee_salaries` now has a parameter `overload_job_titles` to allow overloading the job titles (`employee_position_title`) with the column `underfilled_job_title`, which provides some more information about the job title.