From 5a6a892fd5cda90998889531d76eb3496f6fc253 Mon Sep 17 00:00:00 2001 From: Steffen Kleinle Date: Fri, 21 Jan 2022 12:07:33 +0100 Subject: [PATCH 1/6] Sanitize street and house number --- .../backend/stores/importer/steps/Map.kt | 37 ++++++++++++++++++- .../stores/importer/types/AcceptingStore.kt | 1 + 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt index fbfc28c2e..d700317fa 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt @@ -5,6 +5,7 @@ import app.ehrenamtskarte.backend.stores.importer.PipelineStep import app.ehrenamtskarte.backend.stores.importer.replaceNa import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore import app.ehrenamtskarte.backend.stores.importer.types.LbeAcceptingStore +import org.intellij.lang.annotations.Language import org.slf4j.Logger const val MISCELLANEOUS_CATEGORY = 9 @@ -21,6 +22,7 @@ class Map(private val logger: Logger) : PipelineStep, Li cleanPostalCode(it.postalCode), it.street.clean(), it.houseNumber.clean(), + null, it.longitude.safeToDouble(), it.latitude.safeToDouble(), categoryId(it.category!!), @@ -28,13 +30,46 @@ class Map(private val logger: Logger) : PipelineStep, Li it.telephone.clean(), it.homepage.clean(), it.discount.clean() - ) + ).sanitizeStreetHouseNumber() } catch (e: Exception) { logger.info("Exception occurred while mapping $it", e) null } } + private fun houseNumberRegex(): Regex { + // House number range, e.g. "[5] - 7", "[2]+3" or "[11] und 12" + @Language("RegExp") + val houseNumberRange = """\s?(-|\+|u\.|und)\s?[0-9]+""" + + // Additional house number info, e.g. "[13] 1/2" or "[1] 3/4" (must not be followed by another digit) + @Language("RegExp") + val houseNumberAdditionFraction = """\s?[0-9]/[0-9]""" + + // Additional house number info, e.g. "[12]a" or "[2] B" (must be followed by a whitespace or the end of the string) + @Language("RegExp") + val houseNumberAdditionLetter = """\s?[a-zA-Z]$|\s""" + + return """[0-9]+(($houseNumberRange)|($houseNumberAdditionFraction)|($houseNumberAdditionLetter))?""".toRegex() + } + + private fun AcceptingStore.sanitizeStreetHouseNumber(): AcceptingStore { + val indexOfDigitInStreet = street?.indexOfFirst { it.isDigit() } ?: -1 + if (street == null || indexOfDigitInStreet < 0) return this + + val cleanedStreet = street.substring(0, indexOfDigitInStreet).trim() // Everything before the first digit + val splitHouseNumber = street.substring(indexOfDigitInStreet).trim() // Everything after the first digit + + val regex = houseNumberRegex() + val completeHouseNumber = if (houseNumber != null && splitHouseNumber != houseNumber) "$splitHouseNumber $houseNumber" else splitHouseNumber + val cleanedHouseNumber = regex.find(completeHouseNumber)!!.value.toLowerCase().trim() + + // Residue that is neither the street nor the house number, e.g. "im Hauptbahnhof", "Ecke Theaterstraße" + val additionalInformation = regex.replaceFirst(completeHouseNumber, "").trim { !it.isLetterOrDigit() }.replaceNa() + + return copy(street = cleanedStreet, houseNumber = cleanedHouseNumber, additionalAddressInformation = additionalInformation) + } + private fun String?.safeToDouble(): Double? { return this?.clean()?.replace(",", ".")?.toDouble() } diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/types/AcceptingStore.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/types/AcceptingStore.kt index 2b20689dd..3ae4d1f0c 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/types/AcceptingStore.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/types/AcceptingStore.kt @@ -7,6 +7,7 @@ data class AcceptingStore( val postalCode: String?, val street: String?, val houseNumber: String?, + val additionalAddressInformation: String?, val longitude: Double?, val latitude: Double?, val categoryId: Int, From e4dcc729869fc099545a1739a230e07d39d046af Mon Sep 17 00:00:00 2001 From: Steffen Kleinle Date: Fri, 21 Jan 2022 15:23:28 +0100 Subject: [PATCH 2/6] Exclude logChange method --- .../backend/stores/importer/logging.kt | 17 +++++++++++++++++ .../backend/stores/importer/steps/Sanitize.kt | 15 +++------------ 2 files changed, 20 insertions(+), 12 deletions(-) create mode 100644 backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt new file mode 100644 index 000000000..f30e3c835 --- /dev/null +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt @@ -0,0 +1,17 @@ +package app.ehrenamtskarte.backend.stores.importer + +import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore +import org.slf4j.Logger + +fun Logger.logChange(storeInfo: String, property: String, oldValue: String?, newValue: String?) { + if (oldValue == newValue) { + info("$property of '$storeInfo' could not be improved, keeping '$oldValue'") + } else { + info("$property of '$storeInfo' changed from '$oldValue' to '$newValue'") + } +} + +fun Logger.logChange(store: AcceptingStore, property: String, oldValue: String?, newValue: String?) { + val storeInfo = listOfNotNull(store.name, store.location, store.street, store.houseNumber).joinToString() + logChange(storeInfo, property, oldValue, newValue) +} diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Sanitize.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Sanitize.kt index c278338ec..d678f14c1 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Sanitize.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Sanitize.kt @@ -4,6 +4,7 @@ import app.ehrenamtskarte.backend.stores.geocoding.FeatureFetcher import app.ehrenamtskarte.backend.stores.geocoding.isCloseToBoundingBox import app.ehrenamtskarte.backend.stores.geocoding.isInBoundingBox import app.ehrenamtskarte.backend.stores.importer.PipelineStep +import app.ehrenamtskarte.backend.stores.importer.logChange import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore import io.ktor.client.HttpClient import kotlinx.coroutines.runBlocking @@ -43,14 +44,14 @@ class Sanitize(private val logger: Logger, httpClient: HttpClient) : PipelineSte val oldCoordinates = "$latitude, $longitude" val newCoordinates = "${feature.latitude()}, ${feature.longitude()}" - logChange(this, "Coordinates", oldCoordinates, newCoordinates) + logger.logChange(this, "Coordinates", oldCoordinates, newCoordinates) return copy(longitude = feature.longitude(), latitude = feature.latitude()) } // Match by coordinates -> replace wrong postal code val newPostalCode = feature?.postalCode() ?: postalCode - logChange(this, "Postal code", postalCode, newPostalCode) + logger.logChange(this, "Postal code", postalCode, newPostalCode) return copy(postalCode = newPostalCode) } @@ -59,16 +60,6 @@ class Sanitize(private val logger: Logger, httpClient: HttpClient) : PipelineSte return this } - private fun logChange(store: AcceptingStore, property: String, oldValue: String?, newValue: String?) { - val storeInfo = listOfNotNull(store.name, store.location, store.street, store.houseNumber).joinToString() - - if (oldValue == newValue) { - logger.info("$property of '$storeInfo' could not be improved, keeping '$oldValue'") - } else { - logger.info("$property of '$storeInfo' changed from '$oldValue' to '$newValue'") - } - } - private fun Feature.latitude(): Double = (geometry as Point).coordinates.latitude private fun Feature.longitude(): Double = (geometry as Point).coordinates.longitude private fun Feature.postalCode(): String? = address()["postcode"] From 2cdf64f957d9b7b32e4ca5df2e6bf96c8f151a46 Mon Sep 17 00:00:00 2001 From: Steffen Kleinle Date: Fri, 21 Jan 2022 15:28:09 +0100 Subject: [PATCH 3/6] Log postal code sanitizing --- .../backend/stores/importer/steps/Map.kt | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt index d700317fa..a6ab074d1 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt @@ -2,6 +2,7 @@ package app.ehrenamtskarte.backend.stores.importer.steps import app.ehrenamtskarte.backend.common.COUNTRY_CODE import app.ehrenamtskarte.backend.stores.importer.PipelineStep +import app.ehrenamtskarte.backend.stores.importer.logChange import app.ehrenamtskarte.backend.stores.importer.replaceNa import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore import app.ehrenamtskarte.backend.stores.importer.types.LbeAcceptingStore @@ -19,7 +20,7 @@ class Map(private val logger: Logger) : PipelineStep, Li it.name!!.trim(), COUNTRY_CODE, it.location!!.trim(), - cleanPostalCode(it.postalCode), + it.cleanPostalCode(), it.street.clean(), it.houseNumber.clean(), null, @@ -83,10 +84,15 @@ class Map(private val logger: Logger) : PipelineStep, Li return this?.replaceNa()?.trim() } - private fun cleanPostalCode(postalCode: String?): String? { - if (postalCode == null) return null - val fiveDigitRegex = """\d{5}""".toRegex() - return fiveDigitRegex.find(postalCode)?.value + private fun LbeAcceptingStore.cleanPostalCode(): String? { + val oldPostalCode = postalCode ?: return null + val fiveDigitRegex = """[0-9]{5}""".toRegex() + + val newPostalCode = fiveDigitRegex.find(oldPostalCode)?.value + if (newPostalCode != oldPostalCode.clean()) { + logger.logChange("$name, $location", "Postal code", oldPostalCode, newPostalCode) + } + return newPostalCode } } From 41181ee22f49f40dc0150a6908d34b76009fe8ba Mon Sep 17 00:00:00 2001 From: Steffen Kleinle Date: Fri, 21 Jan 2022 15:32:04 +0100 Subject: [PATCH 4/6] Sanitize street and house number --- .../backend/stores/importer/steps/Map.kt | 58 +++++++++++++------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt index a6ab074d1..50343c9b2 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt @@ -13,6 +13,7 @@ const val MISCELLANEOUS_CATEGORY = 9 const val ALTERNATIVE_MISCELLANEOUS_CATEGORY = 99 class Map(private val logger: Logger) : PipelineStep, List>() { + private val houseNumberRegex = houseNumberRegex() override fun execute(input: List) = input.mapNotNull { try { @@ -39,9 +40,13 @@ class Map(private val logger: Logger) : PipelineStep, Li } private fun houseNumberRegex(): Regex { + // House number prefix, e.g. "B[200]", "H[7]" (mostly in industrial parks) + @Language("RegExp") + val houseNumberPrefix = """[A-Z]?""" + // House number range, e.g. "[5] - 7", "[2]+3" or "[11] und 12" @Language("RegExp") - val houseNumberRange = """\s?(-|\+|u\.|und)\s?[0-9]+""" + val houseNumberRange = """\s?(-|\+|u\.|und|/)\s?[0-9]+""" // Additional house number info, e.g. "[13] 1/2" or "[1] 3/4" (must not be followed by another digit) @Language("RegExp") @@ -49,26 +54,45 @@ class Map(private val logger: Logger) : PipelineStep, Li // Additional house number info, e.g. "[12]a" or "[2] B" (must be followed by a whitespace or the end of the string) @Language("RegExp") - val houseNumberAdditionLetter = """\s?[a-zA-Z]$|\s""" + val houseNumberAdditionLetter = """\s?[a-zA-Z]($|\s)""" - return """[0-9]+(($houseNumberRange)|($houseNumberAdditionFraction)|($houseNumberAdditionLetter))?""".toRegex() + return """$houseNumberPrefix[0-9]+(($houseNumberRange)|($houseNumberAdditionFraction)|($houseNumberAdditionLetter))?""".toRegex() } private fun AcceptingStore.sanitizeStreetHouseNumber(): AcceptingStore { - val indexOfDigitInStreet = street?.indexOfFirst { it.isDigit() } ?: -1 - if (street == null || indexOfDigitInStreet < 0) return this - - val cleanedStreet = street.substring(0, indexOfDigitInStreet).trim() // Everything before the first digit - val splitHouseNumber = street.substring(indexOfDigitInStreet).trim() // Everything after the first digit - - val regex = houseNumberRegex() - val completeHouseNumber = if (houseNumber != null && splitHouseNumber != houseNumber) "$splitHouseNumber $houseNumber" else splitHouseNumber - val cleanedHouseNumber = regex.find(completeHouseNumber)!!.value.toLowerCase().trim() - - // Residue that is neither the street nor the house number, e.g. "im Hauptbahnhof", "Ecke Theaterstraße" - val additionalInformation = regex.replaceFirst(completeHouseNumber, "").trim { !it.isLetterOrDigit() }.replaceNa() - - return copy(street = cleanedStreet, houseNumber = cleanedHouseNumber, additionalAddressInformation = additionalInformation) + val isStreetPolluted = street?.find { it.isDigit() } != null + val isHouseNumberPolluted = houseNumber != null && !houseNumberRegex.matches(houseNumber) + + if (isStreetPolluted || isHouseNumberPolluted) { + val address = listOfNotNull(street, houseNumber).joinToString(" ").replace(Regex("""\s{2,}"""), " ") + val match = houseNumberRegex.find(address) + + if (match == null) { + // No house number, the whole address is the street + logger.logChange(this, "Address", "$street|$houseNumber", address) + return copy(street = address, houseNumber = null) + } + + val cleanStreet = address.substring(0, match.range.first).trim() + val cleanHouseNumber = match.value.toLowerCase().trim() + + // Residue that is neither the street nor the house number, e.g. "im Hauptbahnhof", "Ecke Theaterstraße" + val cleanAdditionalInformation = if (match.range.last < address.length - 1) { + val additionalInformation = + address.substring(match.range.last + 1).trim { !it.isLetterOrDigit() }.clean() + if (additionalInformation != cleanHouseNumber) additionalInformation else null + } else null + + val newAddress = listOfNotNull(cleanStreet, cleanHouseNumber, cleanAdditionalInformation).joinToString("|") + logger.logChange(this, "Address", "$street|$houseNumber", newAddress) + + return copy( + street = cleanStreet, + houseNumber = cleanHouseNumber, + additionalAddressInformation = cleanAdditionalInformation + ) + } + return this } private fun String?.safeToDouble(): Double? { From d9f09a9588071293cc37eb75d149e54e560a3e63 Mon Sep 17 00:00:00 2001 From: Steffen Kleinle Date: Fri, 21 Jan 2022 15:52:37 +0100 Subject: [PATCH 5/6] Change order of logging --- .../app/ehrenamtskarte/backend/stores/importer/logging.kt | 2 +- .../app/ehrenamtskarte/backend/stores/importer/steps/Map.kt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt index f30e3c835..da91ce609 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/logging.kt @@ -7,7 +7,7 @@ fun Logger.logChange(storeInfo: String, property: String, oldValue: String?, new if (oldValue == newValue) { info("$property of '$storeInfo' could not be improved, keeping '$oldValue'") } else { - info("$property of '$storeInfo' changed from '$oldValue' to '$newValue'") + info("$property of '$storeInfo' changed to '$newValue' from '$oldValue'") } } diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt index 50343c9b2..ce8a0ceec 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt @@ -69,7 +69,7 @@ class Map(private val logger: Logger) : PipelineStep, Li if (match == null) { // No house number, the whole address is the street - logger.logChange(this, "Address", "$street|$houseNumber", address) + logger.logChange("$name, $location", "Address", "$street|$houseNumber", address) return copy(street = address, houseNumber = null) } @@ -84,7 +84,7 @@ class Map(private val logger: Logger) : PipelineStep, Li } else null val newAddress = listOfNotNull(cleanStreet, cleanHouseNumber, cleanAdditionalInformation).joinToString("|") - logger.logChange(this, "Address", "$street|$houseNumber", newAddress) + logger.logChange("$name, $location", "Address", "$street|$houseNumber", newAddress) return copy( street = cleanStreet, From 2e8b69e003fbfef0605fb517724080e0093350dd Mon Sep 17 00:00:00 2001 From: Steffen Kleinle Date: Fri, 21 Jan 2022 16:34:21 +0100 Subject: [PATCH 6/6] Clean up --- .../backend/stores/importer/steps/Map.kt | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt index ce8a0ceec..8d0ab1b4e 100644 --- a/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt +++ b/backend/src/main/kotlin/app/ehrenamtskarte/backend/stores/importer/steps/Map.kt @@ -40,23 +40,23 @@ class Map(private val logger: Logger) : PipelineStep, Li } private fun houseNumberRegex(): Regex { - // House number prefix, e.g. "B[200]", "H[7]" (mostly in industrial parks) + // E.g. "B[200]", "H[7]" (mostly in industrial parks) @Language("RegExp") - val houseNumberPrefix = """[A-Z]?""" + val prefix = """[A-Z]?""" - // House number range, e.g. "[5] - 7", "[2]+3" or "[11] und 12" + // E.g. "[5] - 7", "[2]+3" or "[11] und 12" @Language("RegExp") - val houseNumberRange = """\s?(-|\+|u\.|und|/)\s?[0-9]+""" + val range = """\s?(-|\+|u\.|und|/)\s?[0-9]+""" - // Additional house number info, e.g. "[13] 1/2" or "[1] 3/4" (must not be followed by another digit) + // E.g. "[13] 1/2" or "[1] 3/4" @Language("RegExp") - val houseNumberAdditionFraction = """\s?[0-9]/[0-9]""" + val fraction = """\s?[0-9]/[0-9]""" - // Additional house number info, e.g. "[12]a" or "[2] B" (must be followed by a whitespace or the end of the string) + // E.g. "[12]a" or "[2] B" (must be followed by a whitespace or the end of the string) @Language("RegExp") - val houseNumberAdditionLetter = """\s?[a-zA-Z]($|\s)""" + val letter = """\s?[a-zA-Z]($|\s)""" - return """$houseNumberPrefix[0-9]+(($houseNumberRange)|($houseNumberAdditionFraction)|($houseNumberAdditionLetter))?""".toRegex() + return Regex("""$prefix[0-9]+(($range)|($fraction)|($letter))?""") } private fun AcceptingStore.sanitizeStreetHouseNumber(): AcceptingStore { @@ -64,33 +64,28 @@ class Map(private val logger: Logger) : PipelineStep, Li val isHouseNumberPolluted = houseNumber != null && !houseNumberRegex.matches(houseNumber) if (isStreetPolluted || isHouseNumberPolluted) { - val address = listOfNotNull(street, houseNumber).joinToString(" ").replace(Regex("""\s{2,}"""), " ") - val match = houseNumberRegex.find(address) + val address = listOfNotNull(street, houseNumber).joinToString(" ") + val houseNumberMatch = houseNumberRegex.find(address) - if (match == null) { + if (houseNumberMatch == null) { // No house number, the whole address is the street logger.logChange("$name, $location", "Address", "$street|$houseNumber", address) return copy(street = address, houseNumber = null) } - val cleanStreet = address.substring(0, match.range.first).trim() - val cleanHouseNumber = match.value.toLowerCase().trim() + val cleanStreet = address.substring(0, houseNumberMatch.range.first).trim() + val cleanHouseNumber = houseNumberMatch.value.toLowerCase().trim() // Residue that is neither the street nor the house number, e.g. "im Hauptbahnhof", "Ecke Theaterstraße" - val cleanAdditionalInformation = if (match.range.last < address.length - 1) { - val additionalInformation = - address.substring(match.range.last + 1).trim { !it.isLetterOrDigit() }.clean() - if (additionalInformation != cleanHouseNumber) additionalInformation else null + val residue = if (houseNumberMatch.range.last < address.length - 1) { + val res = address.substring(houseNumberMatch.range.last + 1).trim { !it.isLetterOrDigit() }.clean() + if (res != cleanHouseNumber) res else null } else null - val newAddress = listOfNotNull(cleanStreet, cleanHouseNumber, cleanAdditionalInformation).joinToString("|") + val newAddress = listOfNotNull(cleanStreet, cleanHouseNumber, residue).joinToString("|") logger.logChange("$name, $location", "Address", "$street|$houseNumber", newAddress) - return copy( - street = cleanStreet, - houseNumber = cleanHouseNumber, - additionalAddressInformation = cleanAdditionalInformation - ) + return copy(street = cleanStreet, houseNumber = cleanHouseNumber, additionalAddressInformation = residue) } return this } @@ -110,7 +105,7 @@ class Map(private val logger: Logger) : PipelineStep, Li private fun LbeAcceptingStore.cleanPostalCode(): String? { val oldPostalCode = postalCode ?: return null - val fiveDigitRegex = """[0-9]{5}""".toRegex() + val fiveDigitRegex = Regex("""[0-9]{5}""") val newPostalCode = fiveDigitRegex.find(oldPostalCode)?.value if (newPostalCode != oldPostalCode.clean()) {