Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make sortBy(ColumnReference) accept pathOf without extra cast #779

Merged
merged 1 commit into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ public fun <T, C> DataFrame<T>.sortBy(columns: SortColumnsSelector<T, C>): DataF
UnresolvedColumnsPolicy.Fail, columns
)

public fun <T> DataFrame<T>.sortBy(vararg cols: ColumnReference<Comparable<*>?>): DataFrame<T> =
public fun <T> DataFrame<T>.sortBy(vararg cols: ColumnReference<*>): DataFrame<T> =
sortBy { cols.toColumnSet() }

public fun <T> DataFrame<T>.sortBy(vararg cols: String): DataFrame<T> = sortBy { cols.toColumnSet() }
Expand All @@ -132,7 +132,7 @@ public fun <T, C> DataFrame<T>.sortByDesc(vararg columns: KProperty<Comparable<C

public fun <T> DataFrame<T>.sortByDesc(vararg columns: String): DataFrame<T> = sortByDesc { columns.toColumnSet() }

public fun <T, C> DataFrame<T>.sortByDesc(vararg columns: ColumnReference<Comparable<C>?>): DataFrame<T> =
public fun <T> DataFrame<T>.sortByDesc(vararg columns: ColumnReference<*>): DataFrame<T> =
sortByDesc { columns.toColumnSet() }

// endregion
Expand All @@ -141,7 +141,7 @@ public fun <T, C> DataFrame<T>.sortByDesc(vararg columns: ColumnReference<Compar

public fun <T, G> GroupBy<T, G>.sortBy(vararg cols: String): GroupBy<T, G> = sortBy { cols.toColumnSet() }

public fun <T, G> GroupBy<T, G>.sortBy(vararg cols: ColumnReference<Comparable<*>?>): GroupBy<T, G> =
public fun <T, G> GroupBy<T, G>.sortBy(vararg cols: ColumnReference<*>): GroupBy<T, G> =
sortBy { cols.toColumnSet() }

public fun <T, G> GroupBy<T, G>.sortBy(vararg cols: KProperty<Comparable<*>?>): GroupBy<T, G> =
Expand All @@ -151,7 +151,7 @@ public fun <T, G, C> GroupBy<T, G>.sortBy(selector: SortColumnsSelector<G, C>):

public fun <T, G> GroupBy<T, G>.sortByDesc(vararg cols: String): GroupBy<T, G> = sortByDesc { cols.toColumnSet() }

public fun <T, G> GroupBy<T, G>.sortByDesc(vararg cols: ColumnReference<Comparable<*>?>): GroupBy<T, G> =
public fun <T, G> GroupBy<T, G>.sortByDesc(vararg cols: ColumnReference<*>): GroupBy<T, G> =
sortByDesc { cols.toColumnSet() }

public fun <T, G> GroupBy<T, G>.sortByDesc(vararg cols: KProperty<Comparable<*>?>): GroupBy<T, G> =
Expand Down
28 changes: 28 additions & 0 deletions core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/sort.kt
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
package org.jetbrains.kotlinx.dataframe.api

import io.kotest.assertions.throwables.shouldThrowMessage
import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.io.readDataFrame
import org.jetbrains.kotlinx.dataframe.nrow
import org.jetbrains.kotlinx.dataframe.testResource
import org.jetbrains.kotlinx.dataframe.testSets.*
import org.jetbrains.kotlinx.dataframe.testSets.DsSalaries
import org.junit.Test

class SortDataColumn {
Expand Down Expand Up @@ -67,4 +72,27 @@ class SortDataColumn {
col.sortWith { df1, df2 -> df1[a] - df2[a] } shouldBe sortedCol
col.sortWith(compareBy { it[a] }) shouldBe sortedCol
}

@Test
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool new dataset for unit tests! Thanks!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great to validate on that some high-level integration tests with more complex logic instead of low-level A.a.b

fun `sort by nested column`() {
val df = testResource("ds_salaries.csv").readDataFrame().cast<DsSalaries>()
val aggregate = df.pivot(false) { companySize }.groupBy { companyLocation }.aggregate {
maxOf { salaryInUsd } into "salary"
maxBy { salaryInUsd } into "extra"
}
aggregate.sortBy(pathOf("L", "salary"))[0][pathOf("L", "salary")] shouldBe null
aggregate.sortByDesc(pathOf("L", "salary"))[0][pathOf("L", "salary")] shouldBe 600_000
}

@Test
fun `sort by invalid nested column`() {
val df = testResource("ds_salaries.csv").readDataFrame().cast<DsSalaries>()
val aggregate = df.pivot(false) { companySize }.groupBy { companyLocation }.aggregate {
maxOf { salaryInUsd } into "salary"
maxBy { salaryInUsd } into "extra"
}
shouldThrowMessage("Can not use ColumnGroup as sort column") {
aggregate.sortBy(pathOf("L", "extra"))
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package org.jetbrains.kotlinx.dataframe.testSets

import org.jetbrains.kotlinx.dataframe.annotations.ColumnName
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema

// Dataset from https://www.kaggle.com/datasets/ruchi798/data-science-job-salaries
@Suppress("unused")
@DataSchema
interface DsSalaries {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to add somewhere the link to the initial source of dataset (for example here, on that data schema), the origin, the license, the possible transformation which we did

It's a gentleman set of actions for copying of external dataset to your codebase. If we missed it earlier, let's start from that

Copy link
Collaborator Author

@koperagen koperagen Jul 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a link. It was downloaded as is, without modifications and is public domain. So, do we need to add anything else here?

@ColumnName("company_location")
val companyLocation: String
@ColumnName("company_size")
val companySize: String
@ColumnName("employee_residence")
val employeeResidence: String
@ColumnName("employment_type")
val employmentType: String
@ColumnName("experience_level")
val experienceLevel: String
@ColumnName("job_title")
val jobTitle: String
@ColumnName("remote_ratio")
val remoteRatio: Int
val salary: Int
@ColumnName("salary_currency")
val salaryCurrency: String
@ColumnName("salary_in_usd")
val salaryInUsd: Int
val untitled: Int
@ColumnName("work_year")
val workYear: Int
}
Loading
Loading