Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/develop' into matthewedwarddavidson-patch-1
Browse files (browse the repository at this point in the history)
  • Loading branch information
skylee03 committed Nov 22, 2023
2 parents 9f7abaa + b0a8b6b commit 62ea3ad
Show file tree
Hide file tree
Showing 14 changed files with 93 additions and 62 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ spark-warehouses/

*.DS_Store*
.clj-kondo/.cache
.clj-kondo/marick

pom.xml
pom.xml.asc
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

Geni (*/gɜni/* or "gurney" without the r) is a [Clojure](https://clojure.org/) dataframe library that runs on [Apache Spark](https://spark.apache.org/). The name means "fire" in Javanese.

[![CI](https://github.com/zero-one-group/geni/workflows/Continuous%20Integration/badge.svg?branch=develop)](https://github.com/zero-one-group/geni/actions)
[![CI](https://github.com/zero-one-group/geni/actions/workflows/continuous-integration.yml/badge.svg?branch=develop)](https://github.com/zero-one-group/geni/actions)
[![Code Coverage](https://codecov.io/gh/zero-one-group/geni/branch/develop/graph/badge.svg)](https://codecov.io/gh/zero-one-group/geni)
[![Clojars Project](https://img.shields.io/clojars/v/zero.one/geni.svg)](http://clojars.org/zero.one/geni)
[![License](https://img.shields.io/github/license/zero-one-group/geni.svg)](LICENSE)
Expand Down
44 changes: 27 additions & 17 deletions docker/project.clj
Original file line number Diff line number Diff line change
@@ -1,38 +1,48 @@
(def spark-deps
'[;; Spark
'[[io.netty/netty-all "4.1.74.Final"]
[com.fasterxml.jackson.core/jackson-core "2.15.3"]
[com.fasterxml.jackson.core/jackson-annotations "2.15.3"]
;; Spark
; This breaks cljdoc: https://github.com/cljdoc/cljdoc/issues/407
; Frozen until issue is resolved.
;[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
[org.apache.spark/spark-avro_2.12 "3.1.1"]
[org.apache.spark/spark-core_2.12 "3.1.1"]
[org.apache.spark/spark-hive_2.12 "3.1.1"]
[org.apache.spark/spark-mllib_2.12 "3.1.1"]
[org.apache.spark/spark-sql_2.12 "3.1.1"]
[org.apache.spark/spark-streaming_2.12 "3.1.1"]
[org.apache.spark/spark-avro_2.12 "3.3.3"]
[org.apache.spark/spark-core_2.12 "3.3.3"]
[org.apache.spark/spark-hive_2.12 "3.3.3"]
[org.apache.spark/spark-mllib_2.12 "3.3.3"]
[org.apache.spark/spark-sql_2.12 "3.3.3"]
[org.apache.spark/spark-streaming_2.12 "3.3.3"]
; Arrow
[org.apache.arrow/arrow-memory-netty "3.0.0"]
[org.apache.arrow/arrow-memory-core "3.0.0"]
[org.apache.arrow/arrow-vector "3.0.0"
[org.apache.arrow/arrow-memory-netty "4.0.0"]
[org.apache.arrow/arrow-memory-core "4.0.0"]
[org.apache.arrow/arrow-vector "4.0.0"
:exclusions [commons-codec com.fasterxml.jackson.core/jackson-databind]]
; Databases
[mysql/mysql-connector-java "8.0.23"]
[org.postgresql/postgresql "42.2.19"]
[mysql/mysql-connector-java "8.0.25"]
[org.postgresql/postgresql "42.2.20"]
[org.xerial/sqlite-jdbc "3.34.0"]
;; Optional: Spark XGBoost
[ml.dmlc/xgboost4j-spark_2.12 "1.2.0"]
[ml.dmlc/xgboost4j_2.12 "1.2.0"]])

(defproject zero.one/geni "0.0.40"
:jvm-opts ["-Duser.country=US" "-Duser.language=en"]
(defproject zero.one/geni "0.0.41"
:jvm-opts ["-Duser.country=US" "-Duser.language=en"
"--add-opens=java.base/java.io=ALL-UNNAMED"
"--add-opens=java.base/java.nio=ALL-UNNAMED"
"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED"
"--add-opens=java.base/java.util=ALL-UNNAMED"
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"]
:description "A Clojure dataframe library that runs on Spark"
:url "https://github.com/zero-one-group/geni"
:license {:name "Apache License"
:url "https://www.apache.org/licenses/LICENSE-2.0"}
:dependencies [[camel-snake-kebab "0.4.2"]
[com.taoensso/nippy "3.3.0"]
[expound "0.8.9"]
[metosin/jsonista "0.3.3"
:exclusions [com.fasterxml.jackson.core/jackson-databind]]
[com.taoensso/nippy "3.1.1"]
[net.cgrand/parsley "0.9.3" :exclusions [org.clojure/clojure]]
[nrepl "0.8.3"]
[org.clojure/clojure "1.10.3"]
[org.clojure/java.data "1.0.86"]
Expand All @@ -45,8 +55,8 @@
:uberjar {:aot :all :dependencies ~spark-deps}
:dev {:dependencies [[criterium "0.4.6"]
[enlive "1.1.6"]
[midje "1.10.3"]
[techascent/tech.ml.dataset "5.21"
[midje "1.10.9"]
[techascent/tech.ml.dataset "6.101"
:exclusions [ch.qos.logback/logback-classic]]]
:plugins [[lein-ancient "0.7.0"]
[lein-cloverage "1.2.2"]
Expand Down
2 changes: 1 addition & 1 deletion examples/geni-clj-app/deps.edn
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@
ml.dmlc/xgboost4j-spark_2.12 {:mvn/version "1.0.0"}
ml.dmlc/xgboost4j_2.12 {:mvn/version "1.0.0"}
;; Geni
zero.one/geni {:mvn/version "0.0.40"}}}
zero.one/geni {:mvn/version "0.0.41"}}}
2 changes: 1 addition & 1 deletion lein-template/project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject geni/lein-template "0.0.40"
(defproject geni/lein-template "0.0.41"
:description "Leiningen template for a Geni application."
:url "https://github.com/zero-one-group/geni/tree/develop/lein-template"
:license {:name "Apache License"
Expand Down
23 changes: 16 additions & 7 deletions lein-template/resources/leiningen/new/geni/project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,20 @@
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.10.3"]
[zero.one/geni "0.0.40"]
[zero.one/geni "0.0.41"]
[metosin/jsonista "0.3.3"
:exclusions [com.fasterxml.jackson.core/jackson-databind]]
[expound "0.8.9"]
[io.netty/netty-all "4.1.74.Final"]
[com.fasterxml.jackson.core/jackson-core "2.15.3"]
[com.fasterxml.jackson.core/jackson-annotations "2.15.3"]
;; Spark
[org.apache.spark/spark-core_2.12 "3.1.2"]
[org.apache.spark/spark-hive_2.12 "3.1.2"]
[org.apache.spark/spark-mllib_2.12 "3.1.2"]
[org.apache.spark/spark-sql_2.12 "3.1.2"]
[org.apache.spark/spark-streaming_2.12 "3.1.2"]
[org.apache.spark/spark-yarn_2.12 "3.1.2"]
[org.apache.spark/spark-core_2.12 "3.3.3"]
[org.apache.spark/spark-hive_2.12 "3.3.3"]
[org.apache.spark/spark-mllib_2.12 "3.3.3"]
[org.apache.spark/spark-sql_2.12 "3.3.3"]
[org.apache.spark/spark-streaming_2.12 "3.3.3"]
[org.apache.spark/spark-yarn_2.12 "3.3.3"]
[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
; Arrow
[org.apache.arrow/arrow-memory-netty "4.0.0"]
Expand All @@ -40,6 +43,12 @@
"--class"
"{{namespace}}.core"
"target/uberjar/{{raw-name}}-standalone.jar"]]}{{/dataproc?}}
:jvm-opts ["--add-opens=java.base/java.io=ALL-UNNAMED"
"--add-opens=java.base/java.nio=ALL-UNNAMED"
"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED"
"--add-opens=java.base/java.util=ALL-UNNAMED"
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"]
:profiles {:uberjar {:aot :all}
:dev {:plugins [[lein-ancient "0.7.0"]]}}
:main ^:skip-aot {{namespace}}.core
Expand Down
44 changes: 27 additions & 17 deletions project.clj
Original file line number Diff line number Diff line change
@@ -1,38 +1,48 @@
(def spark-deps
'[;; Spark
'[[io.netty/netty-all "4.1.74.Final"]
[com.fasterxml.jackson.core/jackson-core "2.15.3"]
[com.fasterxml.jackson.core/jackson-annotations "2.15.3"]
;; Spark
; This breaks cljdoc: https://github.com/cljdoc/cljdoc/issues/407
; Frozen until issue is resolved.
;[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
[org.apache.spark/spark-avro_2.12 "3.1.1"]
[org.apache.spark/spark-core_2.12 "3.1.1"]
[org.apache.spark/spark-hive_2.12 "3.1.1"]
[org.apache.spark/spark-mllib_2.12 "3.1.1"]
[org.apache.spark/spark-sql_2.12 "3.1.1"]
[org.apache.spark/spark-streaming_2.12 "3.1.1"]
[org.apache.spark/spark-avro_2.12 "3.3.3"]
[org.apache.spark/spark-core_2.12 "3.3.3"]
[org.apache.spark/spark-hive_2.12 "3.3.3"]
[org.apache.spark/spark-mllib_2.12 "3.3.3"]
[org.apache.spark/spark-sql_2.12 "3.3.3"]
[org.apache.spark/spark-streaming_2.12 "3.3.3"]
; Arrow
[org.apache.arrow/arrow-memory-netty "3.0.0"]
[org.apache.arrow/arrow-memory-core "3.0.0"]
[org.apache.arrow/arrow-vector "3.0.0"
[org.apache.arrow/arrow-memory-netty "4.0.0"]
[org.apache.arrow/arrow-memory-core "4.0.0"]
[org.apache.arrow/arrow-vector "4.0.0"
:exclusions [commons-codec com.fasterxml.jackson.core/jackson-databind]]
; Databases
[mysql/mysql-connector-java "8.0.23"]
[org.postgresql/postgresql "42.2.19"]
[mysql/mysql-connector-java "8.0.25"]
[org.postgresql/postgresql "42.2.20"]
[org.xerial/sqlite-jdbc "3.34.0"]
;; Optional: Spark XGBoost
[ml.dmlc/xgboost4j-spark_2.12 "1.2.0"]
[ml.dmlc/xgboost4j_2.12 "1.2.0"]])

(defproject zero.one/geni "0.0.40"
:jvm-opts ["-Duser.country=US" "-Duser.language=en"]
(defproject zero.one/geni "0.0.41"
:jvm-opts ["-Duser.country=US" "-Duser.language=en"
"--add-opens=java.base/java.io=ALL-UNNAMED"
"--add-opens=java.base/java.nio=ALL-UNNAMED"
"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED"
"--add-opens=java.base/java.util=ALL-UNNAMED"
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"]
:description "A Clojure dataframe library that runs on Spark"
:url "https://github.com/zero-one-group/geni"
:license {:name "Apache License"
:url "https://www.apache.org/licenses/LICENSE-2.0"}
:dependencies [[camel-snake-kebab "0.4.2"]
[com.taoensso/nippy "3.3.0"]
[expound "0.8.9"]
[metosin/jsonista "0.3.3"
:exclusions [com.fasterxml.jackson.core/jackson-databind]]
[com.taoensso/nippy "3.1.1"]
[net.cgrand/parsley "0.9.3" :exclusions [org.clojure/clojure]]
[nrepl "0.8.3"]
[org.clojure/clojure "1.10.3"]
[org.clojure/java.data "1.0.86"]
Expand All @@ -45,8 +55,8 @@
:uberjar {:aot :all :dependencies ~spark-deps}
:dev {:dependencies [[criterium "0.4.6"]
[enlive "1.1.6"]
[midje "1.10.3"]
[techascent/tech.ml.dataset "5.21"
[midje "1.10.9"]
[techascent/tech.ml.dataset "6.101"
:exclusions [ch.qos.logback/logback-classic]]]
:plugins [[lein-ancient "0.7.0"]
[lein-cloverage "1.2.2"]
Expand Down
2 changes: 1 addition & 1 deletion resources/GENI_REPL_RELEASED_VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.40
0.0.41
3 changes: 2 additions & 1 deletion src/clojure/zero_one/geni/core/functions.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns zero-one.geni.core.functions
(:refer-clojure :exclude [concat
(:refer-clojure :exclude [abs
concat
flatten
hash
map
Expand Down
6 changes: 3 additions & 3 deletions test/zero_one/geni/data_sources_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
(g/dtypes dummy-df) => {:coord "ArrayType(DoubleType,true)"
:prop "MapType(StringType,StringType,true)"
:rooms (str "StructType("
"StructField(rooms,LongType,true), "
"StructField(rooms,LongType,true),"
"StructField(bathroom,DoubleType,true))")})
(fact "correct direct schema option"
(-> (g/read-parquet!
Expand All @@ -46,7 +46,7 @@
g/dtypes) => {:coord "ArrayType(LongType,true)"
:prop "MapType(StringType,StringType,true)"
:rooms (str "StructType("
"StructField(rooms,IntegerType,true), "
"StructField(rooms,IntegerType,true),"
"StructField(bathroom,FloatType,true))")})
(fact "correct data-oriented schema option"
(-> (g/read-parquet!
Expand All @@ -57,7 +57,7 @@
g/dtypes) => {:coord "ArrayType(ShortType,true)"
:prop "MapType(StringType,StringType,true)"
:rooms (str "StructType("
"StructField(rooms,FloatType,true), "
"StructField(rooms,FloatType,true),"
"StructField(bathroom,LongType,true))")})))

(facts "On binary data" :binary
Expand Down
10 changes: 5 additions & 5 deletions test/zero_one/geni/dataset_creation_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,15 @@
[(g/row (g/row 27 42))
(g/row (g/row 57 18))]
{:coord {:x :int :y :int}}))
=> {:coord "StructType(StructField(x,IntegerType,true), StructField(y,IntegerType,true))"})
=> {:coord "StructType(StructField(x,IntegerType,true),StructField(y,IntegerType,true))"})
(fact "of struct array fields"
(g/dtypes
(g/create-dataframe
@tr/spark
[(g/row [(g/row 27 42)])
(g/row [(g/row 57 18)])]
{:coords [{:x :int :y :int}]}))
=> {:coords "ArrayType(StructType(StructField(x,IntegerType,true), StructField(y,IntegerType,true)),true)"}))
=> {:coords "ArrayType(StructType(StructField(x,IntegerType,true),StructField(y,IntegerType,true)),true)"}))

(facts "On building blocks"
(fact "can instantiate vectors"
Expand Down Expand Up @@ -266,7 +266,7 @@
(instance? Dataset dataset) => true
(g/column-names dataset) => ["a" "b"]
(g/dtypes dataset) => {:a "LongType"
:b "StructType(StructField(z,ArrayType(StringType,true),true), StructField(y,BooleanType,true))"}))
:b "StructType(StructField(z,ArrayType(StringType,true),true),StructField(y,BooleanType,true))"}))
(fact "should create the right schema for list of maps"
(let [dataset (g/table->dataset
@tr/spark
Expand All @@ -276,7 +276,7 @@
(instance? Dataset dataset) => true
(g/column-names dataset) => ["a" "b"]
(g/dtypes dataset) => {:a "LongType"
:b "ArrayType(StructType(StructField(z,LongType,true), StructField(y,DoubleType,true)),true)"}))
:b "ArrayType(StructType(StructField(z,LongType,true),StructField(y,DoubleType,true)),true)"}))
(fact "should create the right schema for list of list of maps"
(let [dataset (g/table->dataset
@tr/spark
Expand All @@ -286,7 +286,7 @@
(instance? Dataset dataset) => true
(g/column-names dataset) => ["a" "b"]
(g/dtypes dataset) => {:a "LongType"
:b "ArrayType(ArrayType(StructType(StructField(z,LongType,true), StructField(y,BooleanType,true)),true),true)"})))
:b "ArrayType(ArrayType(StructType(StructField(z,LongType,true),StructField(y,BooleanType,true)),true),true)"})))

(facts "On spark range"
(fact "should create simple datasets"
Expand Down
2 changes: 1 addition & 1 deletion test/zero_one/geni/dataset_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@
(-> (df-20)
(g/repartition :Suburb :SellerG)
g/partitions
count) => #(< 1 %))
count) => #(<= 1 %))
(fact "able to repartition by number and columns"
(-> (df-20)
(g/repartition 10 :Suburb :SellerG)
Expand Down
2 changes: 1 addition & 1 deletion test/zero_one/geni/rdd_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
(rdd/resources) => {}
(rdd/spark-home) => (System/getenv "SPARK_HOME")
(rdd/sc) => (partial instance? SparkContext)
(rdd/version) => "3.1.1"))
(rdd/version) => "3.3.3"))

(facts "On repartitioning" :rdd
(fact "partition-by works"
Expand Down
12 changes: 6 additions & 6 deletions test/zero_one/geni/sql_functions_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
:to-2 (g/to-json (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")})
{:timestampFormat "dd/MM/yyyy"})})
g/collect
first) => {:schema-1 "ARRAY<STRUCT<`col`: BIGINT>>"
:schema-2 "ARRAY<STRUCT<`col`: BIGINT>>"
first) => {:schema-1 "ARRAY<STRUCT<col: BIGINT>>"
:schema-2 "ARRAY<STRUCT<col: BIGINT>>"
:from-1 {:a 1 :b 0.8}
:from-2 {:time (Timestamp. 1440547200000)}
:to-1 "{\"a\":1,\"b\":2}"
Expand All @@ -44,8 +44,8 @@
:to-2 (g/to-csv (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")})
{:timestampFormat "dd/MM/yyyy"})})
g/collect
first) => {:schema-1 "STRUCT<`_c0`: INT, `_c1`: STRING>"
:schema-2 "STRUCT<`_c0`: INT, `_c1`: STRING>"
first) => {:schema-1 "STRUCT<_c0: INT, _c1: STRING>"
:schema-2 "STRUCT<_c0: INT, _c1: STRING>"
:from-1 {:a 1 :b 0.8}
:from-2 {:time (Timestamp. 1440547200000)}
:to-1 "1,2"
Expand Down Expand Up @@ -214,7 +214,7 @@
(-> (df-20)
(g/cube :SellerG :Regionname)
(g/agg (g/grouping-id :SellerG :Regionname))
g/first-vals) => ["Nelson" nil 1]
g/first-vals) => ["Biggin" "Northern Metropolitan" 0]
(-> (df-20)
(g/group-by :SellerG)
(g/agg (-> (g/collect-list :Regionname) (g/as :regions)))
Expand Down Expand Up @@ -503,7 +503,7 @@
(g/agg
(g/count-distinct {:seller :SellerG
:suburb :Suburb}))
g/column-names) => ["count(SellerG AS `seller`, Suburb AS `suburb`)"])))
g/column-names) => ["count(SellerG AS seller, Suburb AS suburb)"])))

(facts "On window functions" :slow
(let [window (g/window {:partition-by :SellerG :order-by :Price})]
Expand Down

0 comments on commit 62ea3ad

Please sign in to comment.