Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter duplicates #481

Merged
merged 7 commits into from
Feb 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package app.ehrenamtskarte.backend.common
package app.ehrenamtskarte.backend.stores

// Country code and state shared by the geocoding and sanitizing steps of the store import.
// NOTE(review): presumably used to restrict geocoding lookups to Bavaria, Germany - confirm against the callers.
const val COUNTRY_CODE = "de"
const val STATE = "Bayern"

// Postal code lookup and address sanitation fail (and do not really make sense) for a "Postfach" (PO box),
// so streets matching this pattern are excluded from those steps.
const val STREET_EXCLUDE_PATTERN = "Postfach"

// Category id of the "miscellaneous" category; FilterLbe accepts the ids 0..MISCELLANEOUS_CATEGORY_ID as valid.
const val MISCELLANEOUS_CATEGORY_ID = 9
// Second id that is also accepted as valid; MapFromLbe maps it onto MISCELLANEOUS_CATEGORY_ID.
const val ALTERNATIVE_MISCELLANEOUS_CATEGORY_ID = 99

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package app.ehrenamtskarte.backend.stores.geocoding

import app.ehrenamtskarte.backend.common.COUNTRY_CODE
import app.ehrenamtskarte.backend.common.STATE
import app.ehrenamtskarte.backend.stores.COUNTRY_CODE
import app.ehrenamtskarte.backend.stores.STATE
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import com.fasterxml.jackson.databind.ObjectMapper
import io.ktor.client.HttpClient
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ object LbeDataImporter {
.addStep(SanitizeAddress(logger), logger) { logger.info("== Sanitize address ==") }
.addStep(SanitizeGeocode(logger, httpClient), logger) { logger.info("== Sanitize data with geocoding ==") }
.addStep(PostSanitizeFilter(logger, httpClient), logger) { logger.info("== Filter sanitized data ==") }
.addStep(Encode(logger), logger) { logger.info("== Handle encoding issues ==") }
.addStep(FilterDuplicates(logger), logger) { logger.info("== Filter duplicated data ==") }
.addStep(Store(logger, manualImport), logger) { logger.info("== Store remaining data to db ==") }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ fun Logger.logChange(storeInfo: String, property: String, oldValue: String?, new
}

/**
 * Convenience overload of [logChange] that takes the affected [store] instead of a ready-made description.
 *
 * The store is described by its non-null name, location, street and house number (joined with ", ") and the
 * call is then delegated to the String-based overload together with [property], [oldValue] and [newValue].
 */
fun Logger.logChange(store: AcceptingStore, property: String, oldValue: String?, newValue: String?) {
val storeInfo = listOfNotNull(store.name, store.location, store.street, store.houseNumber).joinToString()
logChange(storeInfo, property, oldValue, newValue)
logChange(storeInfo(store), property, oldValue, newValue)
}

/**
 * Writes an info-level log entry stating that [count] duplicates of [store] were removed.
 * The store is described by the same summary that the change log entries use.
 */
fun Logger.logRemoveDuplicates(store: AcceptingStore, count: Int): Unit =
    info("Removed duplicates ($count) of '${storeInfo(store)}'")

/**
 * Builds a short human-readable description of [store] for log messages:
 * its name, location, street and house number, skipping null parts, joined with ", ".
 */
private fun storeInfo(store: AcceptingStore): String =
    with(store) { listOfNotNull(name, location, street, houseNumber) }.joinToString()

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.logRemoveDuplicates
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import org.slf4j.Logger

/**
 * Pipeline step that collapses duplicated stores into a single entry.
 *
 * Two stores are treated as duplicates when the concatenation of their name, postal code and street is equal
 * after lower-casing it and dropping every character that is neither a letter nor a digit.
 * Every group of duplicates is merged into one store, see [deduplicate].
 */
class FilterDuplicates(private val logger: Logger) : PipelineStep<List<AcceptingStore>, List<AcceptingStore>>() {

    /**
     * Groups [input] by the normalized name + postal code + street key and returns one store per group.
     * Groups keep the order in which their first member appears in [input].
     */
    override fun execute(input: List<AcceptingStore>): List<AcceptingStore> {
        // Group by name + postal code + street to detect duplicates
        val groups = input.groupBy {
            (it.name + it.postalCode + it.street).toLowerCase().filter { char -> char.isLetterOrDigit() }
        }

        return groups.values.map { it.deduplicate() }
    }

    /**
     * Merges a group of duplicated stores (the receiver) into a single store.
     *
     * - name, country code, postal code and street are taken from the last store of the group,
     * - the other scalar properties use the last non-null value found in the group,
     * - the coordinates use the value with the longest textual representation,
     * - all distinct discounts are kept, separated by line breaks.
     *
     * @throws IllegalStateException if no store of the group provides a location or a category id
     */
    private fun List<AcceptingStore>.deduplicate(): AcceptingStore {
        if (size == 1) return first() // No duplicates, nothing to do

        // Use the last as that is perhaps the last updated/created one
        val store = last()

        logger.logRemoveDuplicates(store, size - 1)

        val location = lastValue("locations") { it.location }
        val categoryId = lastValue("categoryIds") { it.categoryId }
        val houseNumber = lastValue("house numbers") { it.houseNumber }
        val website = lastValue("websites") { it.website }
        val email = lastValue("emails") { it.email }
        val telephone = lastValue("telephones") { it.telephone }
        val additionalAddressInformation = lastValue("additional address information") { it.additionalAddressInformation }

        // The coordinates are often just cut after some digits so use the one with the best precision
        val longitude = mapNotNull { it.longitude }.maxBy { it.toString().length }
        val latitude = mapNotNull { it.latitude }.maxBy { it.toString().length }

        // Combine all descriptions because we have no way of knowing which is the correct one
        val discounts = mapNotNull { it.discount }.toSet().joinToString("\n")

        return AcceptingStore(
            store.name,
            store.countryCode,
            checkNotNull(location) { "No location found while merging duplicates of '${store.name}'" },
            store.postalCode,
            store.street,
            houseNumber,
            additionalAddressInformation,
            longitude,
            latitude,
            checkNotNull(categoryId) { "No category id found while merging duplicates of '${store.name}'" },
            email,
            telephone,
            website,
            discounts
        )
    }

    /**
     * Returns the value of the last store in the group for which [transform] yields a non-null result,
     * or null if there is no such store.
     *
     * If the group contains more than one distinct value, all of them are logged under the label [property]
     * so that conflicting data can be inspected manually.
     */
    private fun <T : Any> List<AcceptingStore>.lastValue(property: String, transform: (AcceptingStore) -> T?): T? {
        val values = mapNotNull(transform)
        val uniqueValues = values.toSet()

        if (uniqueValues.size > 1) {
            logger.info("$property: ${uniqueValues.joinToString("', '", "'", "'")}")
        }

        // Pick from the list, not from the set: a set keeps the position of the FIRST occurrence, so for the
        // values [A, B, A] the last set element is B although the last (most recent) store says A.
        return values.lastOrNull()
    }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.stores.ALTERNATIVE_MISCELLANEOUS_CATEGORY_ID
import app.ehrenamtskarte.backend.stores.MISCELLANEOUS_CATEGORY_ID
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.matchesNa
import app.ehrenamtskarte.backend.stores.importer.types.LbeAcceptingStore
Expand Down Expand Up @@ -40,7 +42,7 @@ class FilterLbe(private val logger: Logger): PipelineStep<List<LbeAcceptingStore
}

private fun LbeAcceptingStore.isValidCategory(): Boolean {
val validCategories = (0..MISCELLANEOUS_CATEGORY) + listOf(ALTERNATIVE_MISCELLANEOUS_CATEGORY)
val validCategories = (0..MISCELLANEOUS_CATEGORY_ID) + listOf(ALTERNATIVE_MISCELLANEOUS_CATEGORY_ID)
val valid = category?.toIntOrNull() in validCategories

if (!valid)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.common.COUNTRY_CODE
import app.ehrenamtskarte.backend.stores.ALTERNATIVE_MISCELLANEOUS_CATEGORY_ID
import app.ehrenamtskarte.backend.stores.COUNTRY_CODE
import app.ehrenamtskarte.backend.stores.MISCELLANEOUS_CATEGORY_ID
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.replaceNa
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import app.ehrenamtskarte.backend.stores.importer.types.LbeAcceptingStore
import org.apache.commons.text.StringEscapeUtils
import org.slf4j.Logger

const val MISCELLANEOUS_CATEGORY = 9
const val ALTERNATIVE_MISCELLANEOUS_CATEGORY = 99

class MapFromLbe(private val logger: Logger) : PipelineStep<List<LbeAcceptingStore>, List<AcceptingStore>>() {
override fun execute(input: List<LbeAcceptingStore>) = input.mapNotNull {
try {
Expand Down Expand Up @@ -41,11 +41,18 @@ class MapFromLbe(private val logger: Logger) : PipelineStep<List<LbeAcceptingSto

/**
 * Converts the textual [category] into its numeric category id.
 *
 * The alternative miscellaneous id is mapped onto the regular miscellaneous id, every other id is
 * returned unchanged.
 *
 * @throws NumberFormatException if [category] is not a valid integer (from [String.toInt])
 */
private fun categoryId(category: String): Int {
val int = category.toInt()
return if (int == ALTERNATIVE_MISCELLANEOUS_CATEGORY) MISCELLANEOUS_CATEGORY else int
return if (int == ALTERNATIVE_MISCELLANEOUS_CATEGORY_ID) MISCELLANEOUS_CATEGORY_ID else int
}

private fun String.decodeSpecialCharacters(): String {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved from old step Encode

// We often get a double encoded string, i.e. &amp;amp;
return StringEscapeUtils
.unescapeHtml4(StringEscapeUtils.unescapeHtml4(this))
.replace("<br/>", "\n")
}

private fun String?.clean(removeSubsequentWhitespaces: Boolean = true): String? {
val trimmed = this?.replaceNa()?.trim()
val trimmed = this?.replaceNa()?.trim()?.decodeSpecialCharacters()
if (removeSubsequentWhitespaces) {
return trimmed?.replace(Regex("""\s{2,}"""), " ")
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.stores.geocoding.FeatureFetcher
import app.ehrenamtskarte.backend.common.STATE
import app.ehrenamtskarte.backend.stores.STATE
import app.ehrenamtskarte.backend.stores.geocoding.isInBoundingBox
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.common.STREET_EXCLUDE_PATTERN
import app.ehrenamtskarte.backend.stores.STREET_EXCLUDE_PATTERN
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.logChange
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.common.STREET_EXCLUDE_PATTERN
import app.ehrenamtskarte.backend.stores.STREET_EXCLUDE_PATTERN
import app.ehrenamtskarte.backend.stores.geocoding.FeatureFetcher
import app.ehrenamtskarte.backend.stores.geocoding.isCloseToBoundingBox
import app.ehrenamtskarte.backend.stores.geocoding.isInBoundingBox
Expand Down