From 32b9baabac77d274d26052a013b211cbedde293d Mon Sep 17 00:00:00 2001
From: ohad
Date: Tue, 10 Dec 2024 14:06:44 +0200
Subject: [PATCH 1/3] Fix pyspark docs

---
 datafu-spark/README.md                         | 15 +++++----------
 ...21-11-18-introducing-datafu-spark.markdown  | 13 ++++++-------
 .../source/docs/spark/guide.html.markdown.erb  | 19 ++++++++++---------
 3 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/datafu-spark/README.md b/datafu-spark/README.md
index f6700676..ad79d2f7 100644
--- a/datafu-spark/README.md
+++ b/datafu-spark/README.md
@@ -42,11 +42,9 @@ pyspark --jars datafu-spark_2.11-1.8.0.jar --conf spark.executorEnv.PYTHONPATH=d
 The following is an example of calling the Spark version of the datafu _dedup_ method
 
 ```python
-from pyspark_utils.df_utils import PySparkDFUtils
+from pyspark_utils import df_utils
 
-df_utils = PySparkDFUtils()
-
-df_people = sqlContext.createDataFrame([
+df_people = spark.createDataFrame([
     ("a", "Alice", 34),
     ("a", "Sara", 33),
     ("b", "Bob", 36),
@@ -57,12 +55,9 @@ df_people = sqlContext.createDataFrame([
     ("c", "Zoey", 36)],
     ["id", "name", "age"])
 
-func_dedup_res = df_utils.dedup_with_order(dataFrame=df_people, groupCol=df_people.id,
-                   orderCols=[df_people.age.desc(), df_people.name.desc()])
-
-func_dedup_res.registerTempTable("dedup")
-
-func_dedup_res.show()
+df_dedup = df_utils.dedup_with_order(df=df_people, group_col=df_people.id,
+                   order_cols=[df_people.age.desc(), df_people.name.desc()])
+df_dedup.show()
 ```
 
 This should produce the following output

diff --git a/site/source/blog/2021-11-18-introducing-datafu-spark.markdown b/site/source/blog/2021-11-18-introducing-datafu-spark.markdown
index 59a6eaf8..335899a5 100644
--- a/site/source/blog/2021-11-18-introducing-datafu-spark.markdown
+++ b/site/source/blog/2021-11-18-introducing-datafu-spark.markdown
@@ -228,11 +228,9 @@ export PYTHONPATH=datafu-spark_2.11-1.6.0.jarpyspark --jars datafu-spark_2.11-1
 The following is an example of calling the Spark version of the datafu _dedupWithOrder_ method (taken from the _datafu-spark_ [tests](https://github.com/apache/datafu/blob/master/datafu-spark/src/test/resources/python_tests/df_utils_tests.py))
 
 ```
-from pyspark_utils.df_utils import PySparkDFUtils
+from pyspark_utils import df_utils
 
-df_utils = PySparkDFUtils()
-
-df_people = sqlContext.createDataFrame([
+df_people = spark.createDataFrame([
 ... ("a", "Alice", 34),
 ... ("a", "Sara", 33),
 ... ("b", "Bob", 36),
@@ -241,9 +239,10 @@ df_people = sqlContext.createDataFrame([
 ... ("c", "Esther", 32),
 ... ("c", "Fanny", 36),
 ... ("c", "Zoey", 36)],
-... ["id", "name", "age"])func_dedup_res = df_utils.dedup_with_order(dataFrame=df_people, groupCol=df_people.id,orderCols=[df_people.age.desc(), df_people.name.desc()])
-... func_dedup_res.registerTempTable("dedup")
-... func_dedup_res.show()
+... ["id", "name", "age"])
+...
+... df_dedup = df_utils.dedup_with_order(df=df_people, group_col=df_people.id, order_cols=[df_people.age.desc(), df_people.name.desc()])
+... df_dedup.show()
 ```
 
 This should produce the following output:

diff --git a/site/source/docs/spark/guide.html.markdown.erb b/site/source/docs/spark/guide.html.markdown.erb
index 342b669a..b373eabb 100644
--- a/site/source/docs/spark/guide.html.markdown.erb
+++ b/site/source/docs/spark/guide.html.markdown.erb
@@ -43,11 +43,9 @@ pyspark --jars datafu-spark_2.12-<%= current_page.data.version %>-SNAPSHOT.jar -
 The following is an example of calling the Spark version of the datafu _dedup_ method
 
 ```python
-from pyspark_utils.df_utils import PySparkDFUtils
+from pyspark_utils import df_utils
 
-df_utils = PySparkDFUtils()
-
-df_people = sqlContext.createDataFrame([
+df_people = spark.createDataFrame([
     ("a", "Alice", 34),
     ("a", "Sara", 33),
     ("b", "Bob", 36),
@@ -58,12 +56,9 @@ df_people = sqlContext.createDataFrame([
     ("c", "Zoey", 36)],
     ["id", "name", "age"])
 
-func_dedup_res = df_utils.dedup_with_order(dataFrame=df_people, groupCol=df_people.id,
-                   orderCols=[df_people.age.desc(), df_people.name.desc()])
-
-func_dedup_res.registerTempTable("dedup")
+df_dedup = df_utils.dedup_with_order(df=df_people, group_col=df_people.id,
+                   order_cols=[df_people.age.desc(), df_people.name.desc()])
+df_dedup.show()
 
-func_dedup_res.show()
+# or with activate()
+df_utils.activate()
+df_dedup_top_n = df_people.dedup_top_n(n=2, group_col=df_people.id,
+                   order_cols=[df_people.age.desc(), df_people.name.desc()])
+df_dedup_top_n.show()
 ```
 
 This should produce the following output

From a9cd5b912b6fad05ea829e3d09aa4ce28c0b1ce0 Mon Sep 17 00:00:00 2001
From: ohad
Date: Tue, 10 Dec 2024 15:39:47 +0200
Subject: [PATCH 2/3] revert blog

---
 .../2021-11-18-introducing-datafu-spark.markdown | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/site/source/blog/2021-11-18-introducing-datafu-spark.markdown b/site/source/blog/2021-11-18-introducing-datafu-spark.markdown
index 335899a5..59a6eaf8 100644
--- a/site/source/blog/2021-11-18-introducing-datafu-spark.markdown
+++ b/site/source/blog/2021-11-18-introducing-datafu-spark.markdown
@@ -228,9 +228,11 @@ export PYTHONPATH=datafu-spark_2.11-1.6.0.jarpyspark --jars datafu-spark_2.11-1
 The following is an example of calling the Spark version of the datafu _dedupWithOrder_ method (taken from the _datafu-spark_ [tests](https://github.com/apache/datafu/blob/master/datafu-spark/src/test/resources/python_tests/df_utils_tests.py))
 
 ```
-from pyspark_utils import df_utils
+from pyspark_utils.df_utils import PySparkDFUtils
 
-df_people = spark.createDataFrame([
+df_utils = PySparkDFUtils()
+
+df_people = sqlContext.createDataFrame([
 ... ("a", "Alice", 34),
 ... ("a", "Sara", 33),
 ... ("b", "Bob", 36),
@@ -239,10 +241,9 @@ df_people = spark.createDataFrame([
 ... ("c", "Esther", 32),
 ... ("c", "Fanny", 36),
 ... ("c", "Zoey", 36)],
-... ["id", "name", "age"])
-...
-... df_dedup = df_utils.dedup_with_order(df=df_people, group_col=df_people.id, order_cols=[df_people.age.desc(), df_people.name.desc()])
-... df_dedup.show()
+... ["id", "name", "age"])func_dedup_res = df_utils.dedup_with_order(dataFrame=df_people, groupCol=df_people.id,orderCols=[df_people.age.desc(), df_people.name.desc()])
+... func_dedup_res.registerTempTable("dedup")
+... func_dedup_res.show()
 ```
 
 This should produce the following output:

From bb898fc376dc5409f76b901154ab499de22ceac5 Mon Sep 17 00:00:00 2001
From: ohad
Date: Tue, 10 Dec 2024 15:55:02 +0200
Subject: [PATCH 3/3] update readme version

---
 datafu-spark/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafu-spark/README.md b/datafu-spark/README.md
index ad79d2f7..152677ba 100644
--- a/datafu-spark/README.md
+++ b/datafu-spark/README.md
@@ -34,9 +34,9 @@ In order to call the datafu-spark API's from Pyspark, you can do the following (
 First, call pyspark with the following parameters
 
 ```bash
-export PYTHONPATH=datafu-spark_2.11-1.8.0.jar
+export PYTHONPATH=datafu-spark_2.12-2.0.0.jar
 
-pyspark --jars datafu-spark_2.11-1.8.0.jar --conf spark.executorEnv.PYTHONPATH=datafu-spark_2.11-1.8.0.jar
+pyspark --jars datafu-spark_2.12-2.0.0.jar --conf spark.executorEnv.PYTHONPATH=datafu-spark_2.12-2.0.0.jar
 ```
 
 The following is an example of calling the Spark version of the datafu _dedup_ method
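For reviewers without a Spark shell handy, the semantics the patched docs describe can be sketched in plain Python: `dedup_with_order` keeps exactly one row per group, picking the row that ranks first under the given ordering (here age descending, then name descending). The helper `dedup_with_order_py` and its parameter names are illustrative stand-ins for this sketch, not part of the datafu-spark API.

```python
# Plain-Python sketch of the dedup_with_order semantics shown in the docs:
# keep one row per group, choosing the row that ranks first under the ordering.
# dedup_with_order_py is a hypothetical illustration, not the datafu-spark API.

def dedup_with_order_py(rows, group_key, order_key):
    best = {}
    for row in rows:
        k = group_key(row)
        # Within each group, keep the row that ranks highest under order_key.
        if k not in best or order_key(row) > order_key(best[k]):
            best[k] = row
    return sorted(best.values())

# Same data as the df_people example in the patched docs.
people = [
    ("a", "Alice", 34), ("a", "Sara", 33),
    ("b", "Bob", 36), ("b", "Charlie", 30),
    ("c", "David", 29), ("c", "Esther", 32),
    ("c", "Fanny", 36), ("c", "Zoey", 36),
]

# Age descending then name descending == take the max (age, name) per group.
deduped = dedup_with_order_py(people,
                              group_key=lambda r: r[0],
                              order_key=lambda r: (r[2], r[1]))
print(deduped)  # [('a', 'Alice', 34), ('b', 'Bob', 36), ('c', 'Zoey', 36)]
```

Note how the (36, "Zoey") vs. (36, "Fanny") tie in group `c` is broken by the secondary name-descending ordering, which is exactly what the `order_cols=[df_people.age.desc(), df_people.name.desc()]` argument expresses in the Spark example.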