Skip to content

Commit

Permalink
Merge pull request #468 from digitalfabrik/467-sanitize-street-locati…
Browse files Browse the repository at this point in the history
…on-house

Sanitize street and house numbers
  • Loading branch information
steffenkleinle authored Jan 31, 2022
2 parents 5f0564d + bef94b5 commit 6bd717e
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 18 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package app.ehrenamtskarte.backend.stores.importer

import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import org.slf4j.Logger

/**
 * Writes an info-level message describing the outcome of an attempt to improve [property] of a store.
 *
 * @param storeInfo description of the store; embedded in the message in single quotes
 * @param property name of the property; used as the first word of the message
 * @param oldValue value before the attempt
 * @param newValue value after the attempt; when it equals [oldValue] the message states that the value was kept
 */
fun Logger.logChange(storeInfo: String, property: String, oldValue: String?, newValue: String?) {
    val message = if (oldValue != newValue) {
        "$property of '$storeInfo' changed to '$newValue' from '$oldValue'"
    } else {
        "$property of '$storeInfo' could not be improved, keeping '$oldValue'"
    }
    info(message)
}

/**
 * Overload of [logChange] that builds the store description from [store] itself: the non-null values of
 * name, location, street and house number, joined with the default separator of `joinToString`.
 */
fun Logger.logChange(store: AcceptingStore, property: String, oldValue: String?, newValue: String?) {
    val description = sequenceOf(store.name, store.location, store.street, store.houseNumber)
        .filterNotNull()
        .joinToString()
    logChange(description, property, oldValue, newValue)
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,94 @@ package app.ehrenamtskarte.backend.stores.importer.steps

import app.ehrenamtskarte.backend.common.COUNTRY_CODE
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.logChange
import app.ehrenamtskarte.backend.stores.importer.replaceNa
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import app.ehrenamtskarte.backend.stores.importer.types.LbeAcceptingStore
import org.intellij.lang.annotations.Language
import org.slf4j.Logger

const val MISCELLANEOUS_CATEGORY = 9
const val ALTERNATIVE_MISCELLANEOUS_CATEGORY = 99

class Map(private val logger: Logger) : PipelineStep<List<LbeAcceptingStore>, List<AcceptingStore>>() {
private val houseNumberRegex = houseNumberRegex()

/**
 * Maps every [LbeAcceptingStore] of [input] to an [AcceptingStore].
 *
 * A store whose mapping throws is logged at info level and left out of the result: the catch branch
 * evaluates to null and `mapNotNull` discards nulls.
 *
 * NOTE(review): this listing originates from a diff view and appears to contain both the removed and the
 * added variant of the postal code argument and of the constructor's closing parenthesis — confirm against
 * the actual file before relying on it.
 */
override fun execute(input: List<LbeAcceptingStore>) = input.mapNotNull {
try {
AcceptingStore(
// `!!` makes name and location mandatory: a null result of clean() throws and ends up in the catch below
it.name.clean()!!,
COUNTRY_CODE,
it.location.clean()!!,
cleanPostalCode(it.postalCode),
it.cleanPostalCode(),
it.street.clean(),
it.houseNumber.clean(),
// No additional address information yet; sanitizeStreetHouseNumber() may fill it in via copy()
null,
it.longitude.safeToDouble(),
it.latitude.safeToDouble(),
// `!!`: a store without a category throws and is therefore dropped
categoryId(it.category!!),
it.email.clean(),
it.telephone.clean(),
it.homepage.clean(),
it.discount.clean(false)
)
).sanitizeStreetHouseNumber()
} catch (e: Exception) {
logger.info("Exception occurred while mapping $it", e)
null
}
}

/**
 * Builds the pattern that recognises a house number: an optional upper-case prefix letter, one or more
 * digits and at most one suffix, which is either a range, a fraction or a letter.
 * In the examples below the part in square brackets is the leading number the suffix attaches to.
 */
private fun houseNumberRegex(): Regex {
    // Optional prefix letter as found mostly in industrial parks, e.g. "B[200]" or "H[7]"
    @Language("RegExp")
    val prefix = """[A-Z]?"""

    // Second number joined by a connector, e.g. "[5] - 7", "[2]+3" or "[11] und 12"
    @Language("RegExp")
    val range = """\s?(-|\+|u\.|und|/)\s?[0-9]+"""

    // Single-digit fraction, e.g. "[13] 1/2" or "[1] 3/4"
    @Language("RegExp")
    val fraction = """\s?[0-9]/[0-9]"""

    // Single letter that has to be followed by whitespace or the end of the input, e.g. "[12]a" or "[2] B"
    @Language("RegExp")
    val letter = """\s?[a-zA-Z]($|\s)"""

    // Wrap every suffix in its own group and offer them as alternatives, in the order range, fraction, letter
    val suffixAlternatives = listOf(range, fraction, letter).joinToString(separator = "|") { "($it)" }

    return "$prefix[0-9]+($suffixAlternatives)?".toRegex()
}

/**
 * Re-splits street and house number when either of them looks polluted.
 *
 * The street counts as polluted when it contains a digit, the house number when it does not fully match
 * the house number pattern. In that case both values are joined with a space and searched for the first
 * house number:
 * - no match: the joined address becomes the street and the house number is cleared;
 * - match: the text before the match becomes the street, the lower-cased match the house number and the
 *   text after it the additional address information.
 *
 * Every change is reported through the logger. A store that is not polluted is returned as is.
 */
private fun AcceptingStore.sanitizeStreetHouseNumber(): AcceptingStore {
    val streetHasDigit = street?.any { it.isDigit() } == true
    val houseNumberInvalid = houseNumber?.let { !houseNumberRegex.matches(it) } == true
    if (!streetHasDigit && !houseNumberInvalid) return this

    val storeInfo = "$name, $location"
    val oldAddress = "$street|$houseNumber"
    val address = listOfNotNull(street, houseNumber).joinToString(" ")

    val match = houseNumberRegex.find(address)
    if (match == null) {
        // Nothing that looks like a house number: treat the complete address as the street
        logger.logChange(storeInfo, "Address", oldAddress, address)
        return copy(street = address, houseNumber = null)
    }

    val afterMatch = match.range.last + 1
    val cleanStreet = address.take(match.range.first).trim()
    val cleanHouseNumber = match.value.toLowerCase().trim()

    // Whatever follows the house number belongs to neither street nor house number,
    // e.g. "im Hauptbahnhof" or "Ecke Theaterstraße". A mere repetition of the house number is discarded.
    val residue = when {
        afterMatch >= address.length -> null
        else -> address.drop(afterMatch)
            .trim { !it.isLetterOrDigit() }
            .clean()
            .takeIf { it != cleanHouseNumber }
    }

    val newAddress = listOfNotNull(cleanStreet, cleanHouseNumber, residue).joinToString("|")
    logger.logChange(storeInfo, "Address", oldAddress, newAddress)

    return copy(street = cleanStreet, houseNumber = cleanHouseNumber, additionalAddressInformation = residue)
}

/**
 * Converts the receiver to a [Double], accepting a comma as decimal separator.
 *
 * @return null when the receiver is null or clean() (defined elsewhere in this class) yields null
 * @throws NumberFormatException when the cleaned text is not a valid number
 */
private fun String?.safeToDouble(): Double? {
    val normalized = this?.clean()?.replace(",", ".") ?: return null
    return normalized.toDouble()
}
Expand All @@ -52,10 +107,15 @@ class Map(private val logger: Logger) : PipelineStep<List<LbeAcceptingStore>, Li
return trimmed
}

private fun cleanPostalCode(postalCode: String?): String? {
if (postalCode == null) return null
val fiveDigitRegex = """\d{5}""".toRegex()
return fiveDigitRegex.find(postalCode)?.value
/**
 * Extracts the first five consecutive digits from the store's postal code.
 *
 * A change is logged whenever the extracted value differs from the cleaned original postal code.
 *
 * @return the five-digit postal code, or null when the store has no postal code or it contains no
 * five consecutive digits
 */
private fun LbeAcceptingStore.cleanPostalCode(): String? {
    val rawPostalCode = postalCode
    if (rawPostalCode == null) return null

    val extracted = """[0-9]{5}""".toRegex().find(rawPostalCode)?.value

    val alreadyClean = extracted == rawPostalCode.clean()
    if (!alreadyClean) {
        logger.logChange("$name, $location", "Postal code", rawPostalCode, extracted)
    }
    return extracted
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import app.ehrenamtskarte.backend.stores.geocoding.FeatureFetcher
import app.ehrenamtskarte.backend.stores.geocoding.isCloseToBoundingBox
import app.ehrenamtskarte.backend.stores.geocoding.isInBoundingBox
import app.ehrenamtskarte.backend.stores.importer.PipelineStep
import app.ehrenamtskarte.backend.stores.importer.logChange
import app.ehrenamtskarte.backend.stores.importer.types.AcceptingStore
import io.ktor.client.HttpClient
import kotlinx.coroutines.runBlocking
Expand Down Expand Up @@ -43,14 +44,14 @@ class Sanitize(private val logger: Logger, httpClient: HttpClient) : PipelineSte
val oldCoordinates = "$latitude, $longitude"
val newCoordinates = "${feature.latitude()}, ${feature.longitude()}"

logChange(this, "Coordinates", oldCoordinates, newCoordinates)
logger.logChange(this, "Coordinates", oldCoordinates, newCoordinates)

return copy(longitude = feature.longitude(), latitude = feature.latitude())
}

// Match by coordinates -> replace wrong postal code
val newPostalCode = feature?.postalCode() ?: postalCode
logChange(this, "Postal code", postalCode, newPostalCode)
logger.logChange(this, "Postal code", postalCode, newPostalCode)

return copy(postalCode = newPostalCode)
}
Expand All @@ -59,16 +60,6 @@ class Sanitize(private val logger: Logger, httpClient: HttpClient) : PipelineSte
return this
}

/**
 * Logs at info level whether [property] of [store] changed from [oldValue] to [newValue] or was kept.
 * The store is described by its non-null name, location, street and house number.
 */
private fun logChange(store: AcceptingStore, property: String, oldValue: String?, newValue: String?) {
    val storeInfo = sequenceOf(store.name, store.location, store.street, store.houseNumber)
        .filterNotNull()
        .joinToString()

    val message = if (oldValue != newValue) {
        "$property of '$storeInfo' changed from '$oldValue' to '$newValue'"
    } else {
        "$property of '$storeInfo' could not be improved, keeping '$oldValue'"
    }
    logger.info(message)
}

// Latitude of this feature; the geometry is cast to Point, so any other geometry type fails the cast.
private fun Feature.latitude(): Double {
    val point = geometry as Point
    return point.coordinates.latitude
}
// Longitude of this feature; the geometry is cast to Point, so any other geometry type fails the cast.
private fun Feature.longitude(): Double {
    val point = geometry as Point
    return point.coordinates.longitude
}
// Value stored under the "postcode" key of this feature's address, or null when there is none.
private fun Feature.postalCode(): String? {
    val addressDetails = address()
    return addressDetails["postcode"]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ data class AcceptingStore(
val postalCode: String?,
val street: String?,
val houseNumber: String?,
val additionalAddressInformation: String?,
val longitude: Double?,
val latitude: Double?,
val categoryId: Int,
Expand Down

0 comments on commit 6bd717e

Please sign in to comment.