Skip to content

Commit

Permalink
Simplify filtering of duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
steffenkleinle committed Jan 31, 2022
1 parent 59623cb commit 822f0d6
Showing 1 changed file with 26 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,63 +8,49 @@ import org.slf4j.Logger
class FilterDuplicates(private val logger: Logger) : PipelineStep<List<AcceptingStore>, List<AcceptingStore>>() {

// Groups the input stores by a normalized name/postal code/street key and collapses duplicates.
// NOTE(review): this span is a rendered commit diff, so it contains two return statements. It appears that
// the nonDuplicatedStores/duplicatedStores/deduplicatedStores lines and the first return are the removed
// implementation, and the second return is its replacement - confirm against the repository.
override fun execute(input: List<AcceptingStore>): List<AcceptingStore> {
// Group by name + postal code + street to detect duplicates
val groups = input.groupBy {
// The key is lower-cased and reduced to letters and digits only, so differences in case,
// whitespace or punctuation do not prevent two entries from landing in the same group
(it.name + it.postalCode + it.street).toLowerCase().filter { char -> char.isLetterOrDigit() }
}

// Groups with exactly one entry are passed through unchanged
val nonDuplicatedStores = groups.filter { it.value.size == 1 }.values.flatten()
// Groups with more than one entry are deduplicated group by group
val duplicatedStores = groups.filter { it.value.size > 1 }.map { it.value }
val deduplicatedStores = duplicatedStores.map { it.deduplicate() }.flatten()

return nonDuplicatedStores + deduplicatedStores
// Variant that hands every group (including single-entry groups) to deduplicate()
return groups.values.map { it.deduplicate() }
}

// Merges a group of stores that share the same grouping key into deduplicated store data.
// NOTE(review): this span is a rendered commit diff and is not compilable as shown. It appears to interleave
// a removed version (returns List<AcceptingStore>, splits by house number, ends in `return listOf(...)`)
// with an added version (returns a single AcceptingStore, `size == 1` shortcut, ends in
// `return AcceptingStore(...)`) - confirm against the repository before relying on either variant.
private fun List<AcceptingStore>.deduplicate(): List<AcceptingStore> {
// Distinct non-null house numbers that occur in this group
val houseNumbers = mapNotNull { it.houseNumber }.toSet()

if (houseNumbers.size > 1) {
// If there are multiple valid (non-null) house numbers there are probably multiple stores
// Deduplicate for each separately
// Entries without a house number are included in the subset of every house number
return houseNumbers.map { houseNumber -> filter { it.houseNumber == houseNumber || it.houseNumber == null } }
.map { if (it.size > 1) it.deduplicate() else it }
.flatten()
}
private fun List<AcceptingStore>.deduplicate(): AcceptingStore {
if (size == 1) return first() // No duplicates, nothing to do

// Use the last store as that is perhaps the last updated/created one
// Use the last as that is perhaps the last updated/created one
val store = last()
// For each optional field take the last non-null value in the group (null if no entry has one)
val houseNumber = mapNotNull { it.houseNumber }.lastOrNull()
val website = mapNotNull { it.website }.lastOrNull()
val email = mapNotNull { it.email }.lastOrNull()
val telephone = mapNotNull { it.telephone }.lastOrNull()
val additionalAddressInformation = mapNotNull { it.additionalAddressInformation }.lastOrNull()

// The coordinates are often just cut after some digits
// The coordinates are often just cut after some digits so use the one with the best precision
// Precision is approximated by the length of the value's string representation
val longitude = mapNotNull { it.longitude }.maxBy { it.toString().length }
val latitude = mapNotNull { it.latitude }.maxBy { it.toString().length }

// Combine all descriptions because we have no way of knowing which is the correct one
// toSet() drops identical descriptions before the rest are joined, one per line
val discounts = mapNotNull { it.discount }.toSet().joinToString("\n")

// Same last-non-null selection as above (second occurrence of these lines in the diff)
val houseNumber = mapNotNull { it.houseNumber }.lastOrNull()
val website = mapNotNull { it.website }.lastOrNull()
val email = mapNotNull { it.email }.lastOrNull()
val telephone = mapNotNull { it.telephone }.lastOrNull()
val additionalAddressInformation = mapNotNull { it.additionalAddressInformation }.lastOrNull()

// Logs the kept store together with size - 1, presumably the number of entries merged away
logger.logRemoveDuplicates(store, size - 1)

// Name, country code, location, postal code, street and category come from the last store;
// the remaining fields use the merged values computed above
return listOf(
AcceptingStore(
store.name,
store.countryCode,
store.location,
store.postalCode,
store.street,
houseNumber,
additionalAddressInformation,
longitude,
latitude,
store.categoryId,
email,
telephone,
website,
discounts
)
// Single-store variant of the same constructor call
return AcceptingStore(
store.name,
store.countryCode,
store.location,
store.postalCode,
store.street,
houseNumber,
additionalAddressInformation,
longitude,
latitude,
store.categoryId,
email,
telephone,
website,
discounts
)
}

Expand Down

0 comments on commit 822f0d6

Please sign in to comment.