Skip to content

Commit

Permalink
traversable fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
propi committed Feb 27, 2018
1 parent 370378a commit a7872ad
Show file tree
Hide file tree
Showing 15 changed files with 99 additions and 105 deletions.
4 changes: 2 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name := "easyminer-discretization"
name := "EasyMiner-Discretization"

organization := "eu.easyminer"
organization := "com.github.KIZI"

version := "1.1.0"

Expand Down
2 changes: 2 additions & 0 deletions src/main/java/eu/easyminer/discretization/Interval.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,6 @@ public interface Interval {

Boolean isRightBoundClosed();

Boolean isInInterval(double value);

}
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ trait Discretization[T] {

implicit val n: Numeric[T]

def discretize(data: Traversable[T]): Traversable[impl.Interval]
def discretize(data: Traversable[T]): Array[impl.Interval]

}

object Discretization {

object Exceptions {

class IllegalTypeOfIterable(expected: Class[_], given: Class[_]) extends Exception("Illegal type of input iterable. Expected: " + expected.getSimpleName + ", given: " + given.getSimpleName)
class IllegalTypeOfTraversable(expected: Class[_], given: Class[_]) extends Exception("Illegal type of input traversable. Expected: " + expected.getSimpleName + ", given: " + given.getSimpleName)

object UnsupportedDiscretizationTask extends Exception("Unsupported discretization task.")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package eu.easyminer.discretization.algorithm

import eu.easyminer.discretization.DiscretizationTask
import eu.easyminer.discretization.algorithm.DiscretizationTaskValidator.Exceptions.InvalidDiscretizationTask
import eu.easyminer.discretization.impl.{AbsoluteSupport, RelativeSupport, Support}
import eu.easyminer.discretization.impl.Support
import eu.easyminer.discretization.task.{EquidistanceDiscretizationTask, EquifrequencyDiscretizationTask, EquisizeDiscretizationTask}

/**
Expand Down Expand Up @@ -33,8 +33,8 @@ object DiscretizationTaskValidator {
implicit val equifrequencyDiscretizationTaskValidator: DiscretizationTaskValidator[EquifrequencyDiscretizationTask] = (dt: EquifrequencyDiscretizationTask) => throwIfFalse("Number of bins must be greater than zero.")(dt.getNumberOfBins > 0)

implicit val equisizeDiscretizationTaskValidator: DiscretizationTaskValidator[EquisizeDiscretizationTask] = (dt: EquisizeDiscretizationTask) => (dt.getMinSupport: Support) match {
case AbsoluteSupport(s) => throwIfFalse("Absolute support must be greater than 1.")(s > 1)
case RelativeSupport(s) => throwIfFalse("Relative support must be greater than zero and lower than 1")(s > 0 && s < 1)
case Support.Absolute(s) => throwIfFalse("Absolute support must be greater than 1.")(s > 1)
case Support.Relative(s) => throwIfFalse("Relative support must be greater than zero and lower than 1")(s > 0 && s < 1)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,18 @@ import eu.easyminer.discretization.impl.{Interval, IntervalBound}
*/
class EquidistantIntervals[T] private[algorithm](bins: Int)(implicit val n: Numeric[T]) extends Discretization[T] {

def discretize(data: Traversable[T]): Traversable[impl.Interval] = new Traversable[impl.Interval] {
def foreach[U](f: Interval => U): Unit = {
data.view
.map(x => (x, x))
.reduceOption((x, y) => n.min(x._1, y._1) -> n.max(x._2, y._2))
.map(x => n.toDouble(x._1) -> n.toDouble(x._2))
.toIterator
.flatMap { case (min, max) =>
val intervalSize = (max - min) / bins
for (binNumber <- 0 until bins) yield {
val leftBound = IntervalBound.Inclusive(min + intervalSize * binNumber)
val rightBound = if (binNumber + 1 == bins) IntervalBound.Inclusive(max) else IntervalBound.Exclusive(leftBound.value + intervalSize)
Interval(leftBound, rightBound)
}
}.foreach(f)
}
}
/**
  * Splits the value range of `data` into `bins` adjacent intervals of equal width.
  *
  * The minimum and maximum are found in one traversal of `data`. Every interval is left-closed;
  * all intervals but the last are right-open and the last one is right-closed, so the maximum
  * value is covered.
  *
  * @param data numeric values to discretize
  * @return the intervals ordered from the lowest to the highest, or an empty array if `data` is empty
  */
def discretize(data: Traversable[T]): Array[impl.Interval] = data.view
  .map(x => (x, x))
  .reduceOption((x, y) => n.min(x._1, y._1) -> n.max(x._2, y._2))
  .map(x => n.toDouble(x._1) -> n.toDouble(x._2))
  .toIterator
  .flatMap { case (min, max) =>
    val intervalSize = (max - min) / bins
    // Every cutpoint comes from the same expression, so the right bound of one bin is exactly
    // the left bound of the next one. Computing the right bound as `left + size` may differ by
    // one ulp (e.g. 0.5 + 0.1 != 0.1 * 6 in double arithmetic) and would leave a value that
    // belongs to no interval.
    def cutpoint(index: Int): Double = min + intervalSize * index
    for (binNumber <- 0 until bins) yield {
      val leftBound = IntervalBound.Inclusive(cutpoint(binNumber))
      val rightBound = if (binNumber + 1 == bins) IntervalBound.Inclusive(max) else IntervalBound.Exclusive(cutpoint(binNumber + 1))
      Interval(leftBound, rightBound)
    }
  }.toArray

}
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
package eu.easyminer.discretization.algorithm

import eu.easyminer.discretization.algorithm.CutpointsResolver._
import eu.easyminer.discretization.algorithm.Discretization.Exceptions.IllegalTypeOfIterable
import eu.easyminer.discretization.algorithm.Discretization.Exceptions.IllegalTypeOfTraversable
import eu.easyminer.discretization.algorithm.IntervalSmoothing._
import eu.easyminer.discretization.impl.sorting.SortedTraversable
import eu.easyminer.discretization.impl.{InclusiveIntervalBound, Interval, IntervalFrequency, ValueFrequency}
import eu.easyminer.discretization.impl._

/**
* Created by propan on 18. 3. 2017.
*/
class EquifrequentIntervals[T] private[algorithm](bins: Int)(implicit val n: Numeric[T]) extends Discretization[T] {

private def countOptimalFrequency(data: Iterable[T]) = {
val dataCount = data.iterator.size
/**
  * Computes the target number of records per bin: the number of records divided by `bins`,
  * rounded up.
  *
  * @param data records to be discretized; traversed once to count them
  * @return the optimal frequency of a single bin
  */
private def countOptimalFrequency(data: Traversable[T]) = {
  val dataCount = data.size
  // Divide as Double: an Int / Int division truncates before math.ceil is applied,
  // which turned the rounding-up into a no-op.
  math.ceil(dataCount.toDouble / bins).toInt
}

private def searchIntervals(data: Iterable[ValueFrequency[T]], optimalFrequency: Int) = {
private def searchIntervals(data: Traversable[ValueFrequency[T]], optimalFrequency: Int) = {
val intervals = new collection.mutable.ArrayBuffer[IntervalFrequency](bins)
for (value <- data.iterator) {
for (value <- data) {
intervals
.lastOption
.filter(interval => intervals.length == bins || math.abs(optimalFrequency - (interval.frequency + value.frequency)) < math.abs(optimalFrequency - interval.frequency)) match {
case Some(interval) => intervals.update(intervals.length - 1, IntervalFrequency(interval.interval.copy(maxValue = InclusiveIntervalBound(n.toDouble(value.value))), interval.frequency + value.frequency))
case Some(interval) => intervals.update(intervals.length - 1, IntervalFrequency(interval.interval.copy(maxValue = IntervalBound.Inclusive(n.toDouble(value.value))), interval.frequency + value.frequency))
case None =>
val leftRightBound = InclusiveIntervalBound(n.toDouble(value.value))
val leftRightBound = IntervalBound.Inclusive(n.toDouble(value.value))
intervals += IntervalFrequency(Interval(leftRightBound, leftRightBound), value.frequency)
}
}
Expand Down Expand Up @@ -60,14 +60,14 @@ class EquifrequentIntervals[T] private[algorithm](bins: Int)(implicit val n: Num
}


def discretize(data: Traversable[T]): Traversable[Interval] = data match {
def discretize(data: Traversable[T]): Array[Interval] = data match {
case data: SortedTraversable[T] =>
val optimalFrequency = countOptimalFrequency(data)
val intervals = searchIntervals(data, optimalFrequency)
smoothIntervals(intervals, data, 1000000)(canItMoveLeft(optimalFrequency))(canItMoveRight(optimalFrequency))
resolveCutpoints(intervals)
intervals.iterator.map(_.interval).toList
case _ => throw new IllegalTypeOfIterable(classOf[SortedTraversable[T]], data.getClass)
intervals.iterator.map(_.interval).toArray
case _ => throw new IllegalTypeOfTraversable(classOf[SortedTraversable[T]], data.getClass)
}

}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package eu.easyminer.discretization.algorithm

import eu.easyminer.discretization.algorithm.CutpointsResolver._
import eu.easyminer.discretization.algorithm.Discretization.Exceptions.IllegalTypeOfIterable
import eu.easyminer.discretization.algorithm.Discretization.Exceptions.IllegalTypeOfTraversable
import eu.easyminer.discretization.algorithm.IntervalSmoothing._
import eu.easyminer.discretization.impl._
import eu.easyminer.discretization.impl.sorting.SortedTraversable
Expand All @@ -11,20 +11,20 @@ import eu.easyminer.discretization.impl.sorting.SortedTraversable
*/
class EquisizedIntervals[T] private[algorithm](minSupport: Support)(implicit val n: Numeric[T]) extends Discretization[T] {

private def countOptimalFrequency(data: Iterable[T]) = minSupport match {
case RelativeSupport(minSupport) => math.ceil(data.iterator.size * minSupport).toInt
case AbsoluteSupport(minSupport) => minSupport
private def countOptimalFrequency(data: Traversable[T]) = minSupport match {
case Support.Relative(minSupport) => math.ceil(data.size * minSupport).toInt
case Support.Absolute(minSupport) => minSupport
}

private def searchIntervals(data: Iterable[ValueFrequency[T]], optimalFrequency: Int) = {
private def searchIntervals(data: Traversable[ValueFrequency[T]], optimalFrequency: Int) = {
val intervals = new collection.mutable.ArrayBuffer[IntervalFrequency]()
for (value <- data.iterator) {
for (value <- data) {
intervals
.lastOption
.filter(interval => interval.frequency < optimalFrequency) match {
case Some(interval) => intervals.update(intervals.length - 1, IntervalFrequency(interval.interval.copy(maxValue = InclusiveIntervalBound(n.toDouble(value.value))), interval.frequency + value.frequency))
case Some(interval) => intervals.update(intervals.length - 1, IntervalFrequency(interval.interval.copy(maxValue = IntervalBound.Inclusive(n.toDouble(value.value))), interval.frequency + value.frequency))
case None =>
val leftRightBound = InclusiveIntervalBound(n.toDouble(value.value))
val leftRightBound = IntervalBound.Inclusive(n.toDouble(value.value))
intervals += IntervalFrequency(Interval(leftRightBound, leftRightBound), value.frequency)
}
}
Expand Down Expand Up @@ -52,14 +52,14 @@ class EquisizedIntervals[T] private[algorithm](minSupport: Support)(implicit val
decreasedIntervalFreqency >= optimalFrequency && nextDifference < currentDifference
}

def discretize(data: Iterable[T]): Seq[Interval] = data match {
def discretize(data: Traversable[T]): Array[Interval] = data match {
case data: SortedTraversable[T] =>
val optimalFrequency = countOptimalFrequency(data)
val intervals = searchIntervals(data, optimalFrequency)
smoothIntervals(intervals, data, 1000000)(canItMoveLeft(optimalFrequency))(canItMoveRight(optimalFrequency))
resolveCutpoints(intervals)
intervals.iterator.map(_.interval).toList
case _ => throw new IllegalTypeOfIterable(classOf[SortedTraversable[T]], data.getClass)
intervals.iterator.map(_.interval).toArray
case _ => throw new IllegalTypeOfTraversable(classOf[SortedTraversable[T]], data.getClass)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package eu.easyminer.discretization.algorithm
import java.util

import eu.easyminer.discretization.impl.sorting.SortedTraversable
import eu.easyminer.discretization.impl.{InclusiveIntervalBound, IntervalFrequency, ValueFrequency}
import eu.easyminer.discretization.impl.{IntervalBound, IntervalFrequency, ValueFrequency}
import eu.easyminer.discretization.util.NumericByteArray._

/**
Expand All @@ -17,15 +17,15 @@ trait IntervalSmoothing {
(implicit n: Numeric[T]): Unit = {
if (bufferSize < 32) throw new IllegalArgumentException("Buffer size for smoothing must be greater than 31 bytes.")
//input data are converted into ValueFrequency - distinct values aggregated with their counts
val groupedData: Iterable[ValueFrequency[T]] = records
val groupedData: Traversable[ValueFrequency[T]] = records
//values buffer for faster smoothing iteration
val buffer = new util.LinkedList[ValueFrequency[T]]()
//maximal number of values in the buffer
val maxBufferSize = bufferSize / n.zero.length
//smooth until there are no interval changes
val iterates = Iterator.continually {
//within each smoothing iteration all sorted data are iterated
groupedData.iterator.foldLeft(0, false) { case ((pointer, isChanged), currentValue) =>
groupedData.foldLeft(0, false) { case ((pointer, isChanged), currentValue) =>
if (pointer < intervals.length - 1) {
//we have two intervals to compare
val leftInterval = intervals(pointer)
Expand All @@ -42,11 +42,11 @@ trait IntervalSmoothing {
pointer + 1
}
//this method moves right interval border into the left interval
def moveToLeft() = {
def moveToLeft(): Unit = {
//new left interval has right border as prevValue = add prev value into the left interval
intervals.update(pointer, IntervalFrequency(leftInterval.interval.copy(maxValue = rightInterval.interval.minValue), leftInterval.frequency + prevValue.get.frequency))
//new right interval has left border as currentValue = remove prev value from the right interval
intervals.update(pointer + 1, IntervalFrequency(rightInterval.interval.copy(minValue = InclusiveIntervalBound(n.toDouble(currentValue.value))), rightInterval.frequency - prevValue.get.frequency))
intervals.update(pointer + 1, IntervalFrequency(rightInterval.interval.copy(minValue = IntervalBound.Inclusive(n.toDouble(currentValue.value))), rightInterval.frequency - prevValue.get.frequency))
}
//this method moves left interval borders into the right interval
//it moves border from all items in the buffer until condition
Expand All @@ -59,7 +59,7 @@ trait IntervalSmoothing {
val currentValue = buffer.pollFirst()
val prevValue = buffer.getFirst
//new left interval has right border as prevValue = delete current from the left interval
val newLeftInterval = IntervalFrequency(leftInterval.interval.copy(maxValue = InclusiveIntervalBound(n.toDouble(prevValue.value))), leftInterval.frequency - currentValue.frequency)
val newLeftInterval = IntervalFrequency(leftInterval.interval.copy(maxValue = IntervalBound.Inclusive(n.toDouble(prevValue.value))), leftInterval.frequency - currentValue.frequency)
//new right interval has left border as currentValue = add current into the right interval
val newRightInterval = IntervalFrequency(rightInterval.interval.copy(minValue = leftInterval.interval.maxValue), rightInterval.frequency + currentValue.frequency)
//do it again
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import java.io.File
import eu.easyminer.discretization
import eu.easyminer.discretization.algorithm.{Discretization, EquidistantIntervals, EquifrequentIntervals, EquisizedIntervals}
import eu.easyminer.discretization.impl.IterableConversions._
import eu.easyminer.discretization.impl.sorting.{ReversableSortedTraversable, SortedInMemoryNumericTraversable, SortedPersistentNumericTraversable}
import eu.easyminer.discretization.impl.sorting.{SortedInMemoryNumericTraversable, SortedPersistentNumericTraversable}
import eu.easyminer.discretization.{Discretizable, DiscretizationTask}

import scala.language.implicitConversions
Expand All @@ -26,18 +26,18 @@ object DefaultDiscretization extends Discretizable {
implicit val c: java.util.Iterator[A] => Iterator[B] = javaIteratorToIterator[A, B]
val dt = Discretization(discretizationTask)
dt match {
case dt: EquidistantIntervals[B] => dt.discretize(data.asScala)
case dt: EquidistantIntervals[B] => dt.discretize(data.asScala).toArray
case _: EquifrequentIntervals[B] | _: EquisizedIntervals[B] => data match {
case data: discretization.SortedIterable[A] with discretization.PersistentIterable[A] =>
SortedPersistentNumericTraversable[B, Seq[Interval]](data, file)(dt.discretize)
SortedPersistentNumericTraversable[B, Traversable[Interval]](data, file)(dt.discretize).toArray
case data: discretization.InMemoryIterable[A] =>
dt.discretize(SortedInMemoryNumericTraversable(data.iterator(), discretizationTask.getBufferSize))
dt.discretize(SortedInMemoryNumericTraversable(data.asScala, discretizationTask.getBufferSize)).toArray
case data: discretization.ReversableSortedIterable[A] =>
dt.discretize(data: ReversableSortedTraversable[B])
dt.discretize(data.asScala).toArray
case data: discretization.SortedIterable[A] =>
SortedPersistentNumericTraversable[B, Seq[Interval]](data, file)(dt.discretize)
dt.discretize(data.asScala).toArray
case _ =>
SortedPersistentNumericTraversable[B, Seq[Interval]](data.iterator(), directory, discretizationTask.getBufferSize)(dt.discretize)
SortedPersistentNumericTraversable[B, Traversable[Interval]](data.asScala, directory, discretizationTask.getBufferSize)(dt.discretize).toArray
}
case _ => Array()
}
Expand Down
34 changes: 19 additions & 15 deletions src/main/scala/eu/easyminer/discretization/impl/Interval.scala
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
package eu.easyminer.discretization.impl

import eu.easyminer.discretization
import java.lang

import scala.language.implicitConversions
import eu.easyminer.discretization

/**
* Created by propan on 16. 3. 2017.
*/
case class Interval(minValue: IntervalBound, maxValue: IntervalBound)

object Interval {
case class Interval(minValue: IntervalBound, maxValue: IntervalBound) extends discretization.Interval {
def getLeftBoundValue: lang.Double = minValue.value

implicit def intervalToJavaInterval(interval: Interval): discretization.Interval = new discretization.Interval {
def getLeftBoundValue: java.lang.Double = interval.minValue.value
def getRightBoundValue: lang.Double = maxValue.value

def getRightBoundValue: java.lang.Double = interval.maxValue.value
def isLeftBoundOpened: lang.Boolean = minValue.isInstanceOf[IntervalBound.Exclusive]

def isLeftBoundClosed: java.lang.Boolean = interval.minValue.isInstanceOf[IntervalBound.Inclusive]
def isRightBoundOpened: lang.Boolean = maxValue.isInstanceOf[IntervalBound.Exclusive]

def isRightBoundClosed: java.lang.Boolean = interval.maxValue.isInstanceOf[IntervalBound.Inclusive]
def isLeftBoundClosed: lang.Boolean = !isLeftBoundOpened

def isLeftBoundOpened: java.lang.Boolean = !isLeftBoundClosed
def isRightBoundClosed: lang.Boolean = !isRightBoundOpened

def isRightBoundOpened: java.lang.Boolean = !isRightBoundClosed
/**
  * Tests whether `value` lies between the bounds of this interval, taking into account
  * whether each bound is inclusive or exclusive.
  *
  * @param value number to be tested
  * @return true if the value satisfies both the lower and the upper bound
  */
def isInInterval(value: Double): lang.Boolean = {
  val aboveLowerBound = minValue match {
    case IntervalBound.Exclusive(bound) => value > bound
    case IntervalBound.Inclusive(bound) => value >= bound
  }
  val belowUpperBound = maxValue match {
    case IntervalBound.Exclusive(bound) => value < bound
    case IntervalBound.Inclusive(bound) => value <= bound
  }
  aboveLowerBound && belowUpperBound
}

implicit def seqIntervalsToArrayJavaIntervals(intervals: Seq[Interval]): Array[discretization.Interval] = intervals.iterator.map(x => x: discretization.Interval).toArray

}
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ trait IterableConversions {

implicit class PimpedJavaIterable[A <: Number](it: java.lang.Iterable[A]) {

def asScala[B](implicit n: Numeric[B], numberToScalaNumber: A => B): Iterable[B] = {
def asScala[B](implicit n: Numeric[B], numberToScalaNumber: A => B): Traversable[B] = {
implicit val c: java.util.Iterator[A] => Iterator[B] = javaIteratorToIterator[A, B]
it match {
case it: ReversableSortedIterable[A] => it: sorting.ReversableSortedTraversable[B]
case it: SortedIterable[A] => it: sorting.SortedTraversable[B]
case _ => new Iterable[B] {
def iterator: Iterator[B] = it.iterator()
case _ => new Traversable[B] {
def foreach[U](f: B => U): Unit = it.iterator().foreach(f)
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import eu.easyminer.discretization.util.PersistentTraversableOps._
* Created by propan on 17. 3. 2017.
*/
class PersistentNumericTraversable[T] private(col: Traversable[T], file: File)(implicit n: Numeric[T]) extends Traversable[T] {
//implicit private val b2n: Array[Byte] => T = byteArrayToNumber[T]
implicit private val b2n: Array[Byte] => T = byteArrayToNumber[T]
def foreach[U](f: T => U): Unit = if (file.exists()) inputStreamTraversable[T](new FileInputStream(file)).foreach(f) else outputStreamTraversable(col, new FileOutputStream(file))
}

Expand Down
Loading

0 comments on commit a7872ad

Please sign in to comment.