diff --git a/src/main/scala/com/fulcrumgenomics/util/Metric.scala b/src/main/scala/com/fulcrumgenomics/util/Metric.scala index 62b0961ce..4baae498a 100644 --- a/src/main/scala/com/fulcrumgenomics/util/Metric.scala +++ b/src/main/scala/com/fulcrumgenomics/util/Metric.scala @@ -25,24 +25,21 @@ package com.fulcrumgenomics.util -import com.fulcrumgenomics.cmdline.FgBioMain.FailureException -import com.fulcrumgenomics.commons.CommonsDef._ import com.fulcrumgenomics.commons.io.{Writer => CommonsWriter} import com.fulcrumgenomics.commons.reflect.{ReflectionUtil, ReflectiveBuilder} -import com.fulcrumgenomics.commons.util.{DelimitedDataParser, LazyLogging} +import com.fulcrumgenomics.commons.util.DelimitedDataParser import enumeratum.EnumEntry import htsjdk.samtools.util.Iso8601Date -import java.io.{PrintWriter, StringWriter, Writer} +import java.io.Writer import java.nio.file.Path import java.text.{DecimalFormat, NumberFormat, SimpleDateFormat} import java.util.Date import scala.collection.compat._ import scala.collection.concurrent.TrieMap import scala.reflect.runtime.{universe => ru} -import scala.util.{Failure, Success} -object Metric extends LazyLogging { +object Metric { val Delimiter: Char = '\t' val DelimiterAsString: String = s"$Delimiter" @@ -102,53 +99,14 @@ object Metric extends LazyLogging { /** Reads metrics from a set of lines. The first line should be the header with the field names. Each subsequent * line should be a single metric. */ def iterator[T <: Metric](lines: Iterator[String], source: Option[String] = None)(implicit tt: ru.TypeTag[T]): Iterator[T] = { - val clazz: Class[T] = ReflectionUtil.typeTagToClass[T] - - def fail(lineNumber: Int, - message: String, - throwable: Option[Throwable] = None): Unit = { - val sourceMessage = source.map("\nIn source: " + _).getOrElse("") - val fullMessage = s"On line #$lineNumber for metric '${clazz.getSimpleName}'$sourceMessage\n$message" - throwable.foreach { thr => - val stringWriter = new StringWriter - thr.printStackTrace(new PrintWriter(stringWriter)) - val banner = "#" * 80 - logger.debug(banner) - logger.debug(stringWriter.toString) - logger.debug(banner) - } - throw FailureException(message=Some(fullMessage)) - } - - if (lines.isEmpty) fail(lineNumber=1, message="No header found") - val parser = new DelimitedDataParser(lines=lines, delimiter=Delimiter, ignoreBlankLines=false, trimFields=true) - val names = parser.headers.toIndexedSeq - val reflectiveBuilder = new ReflectiveBuilder(clazz) + val builder = new MetricBuilder[T](source=source)(tt) + if (lines.isEmpty) builder.fail(message="No header found", lineNumber=Some(1)) + val parser = new DelimitedDataParser(lines=lines, delimiter=Delimiter, ignoreBlankLines=false, trimFields=true) + val names = parser.headers.toIndexedSeq parser.zipWithIndex.map { case (row, rowIndex) => - forloop(from = 0, until = names.length) { i => - reflectiveBuilder.argumentLookup.forField(names(i)) match { - case Some(arg) => - val value = { - val tmp = row[String](i) - if (tmp.isEmpty && arg.argumentType == classOf[Option[_]]) ReflectionUtil.SpecialEmptyOrNoneToken else tmp - } - - val argumentValue = ReflectionUtil.constructFromString(arg.argumentType, arg.unitType, value) match { - case Success(v) => v - case Failure(thr) => - fail(lineNumber=rowIndex+2, message=s"Could not construct value for column '${arg.name}' of type '${arg.typeDescription}' from '$value'", Some(thr)) - } - arg.value = argumentValue - case None => - fail(lineNumber=rowIndex+2, message=s"Did not have a field with name '${names(i)}'.") - } - } - - // build it. NB: if arguments are missing values, then an exception will be thrown here - // Also, we don't use the default "build()" method since if a collection or option is empty, it will be treated as - // missing. - reflectiveBuilder.build(reflectiveBuilder.argumentLookup.ordered.map(arg => arg.value getOrElse unreachable(s"Arguments not set: ${arg.name}"))) + val argMap = names.zipWithIndex.map { case (name, i) => name -> row[String](i) }.toMap + builder.fromArgMap(argMap=argMap, lineNumber=Some(rowIndex+2)) } } diff --git a/src/main/scala/com/fulcrumgenomics/util/MetricBuilder.scala b/src/main/scala/com/fulcrumgenomics/util/MetricBuilder.scala new file mode 100644 index 000000000..2b12c139e --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/util/MetricBuilder.scala @@ -0,0 +1,142 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.cmdline.FgBioMain.FailureException +import com.fulcrumgenomics.commons.CommonsDef.{forloop, unreachable} +import com.fulcrumgenomics.commons.reflect.{ReflectionUtil, ReflectiveBuilder} +import com.fulcrumgenomics.commons.util.LazyLogging + +import java.io.{PrintWriter, StringWriter} +import scala.reflect.runtime.{universe => ru} +import scala.util.{Failure, Success} + +/** Class for building metrics of type [[T]]. + * + * This is not thread-safe. + * + * @param source optionally, the source of reading (e.g. file) + * @tparam T the metric type + */ +class MetricBuilder[T <: Metric](source: Option[String] = None)(implicit tt: ru.TypeTag[T]) extends LazyLogging { + // The main reason why a builder is necessary is to cache some expensive reflective calls. + private val clazz: Class[T] = ReflectionUtil.typeTagToClass[T] + private val reflectiveBuilder = new ReflectiveBuilder(clazz) + private val names = Metric.names[T] + + /** Builds a metric from a delimited line + * + * @param line the line with delimited values + * @param delim the delimiter of the values + * @param lineNumber optionally, the line number when building a metric from a line in a file + * @return + */ + def fromLine(line: String, delim: String = Metric.DelimiterAsString, lineNumber: Option[Int] = None): T = { + fromValues(values = line.split(delim), lineNumber = lineNumber) + } + + /** Builds a metric from values for the complete set of metric fields + * + * @param values the values in the same order as the names defined in the class + * @param lineNumber optionally, the line number when building a metric from a line in a file + * @return + */ + def fromValues(values: Iterable[String], lineNumber: Option[Int] = None): T = { + val vals = values.toIndexedSeq + if (names.length != vals.length) { + fail(message = f"Failed decoding: expected '${names.length}' fields, found '${vals.length}'.", lineNumber = lineNumber) + } + fromArgMap(argMap = names.zip(values).toMap, lineNumber = lineNumber) + } + + /** Builds a metric of type [[T]] + * + * @param argMap map of field names to values. All required fields must be given. Can be in any order. + * @param lineNumber optionally, the line number when building a metric from a line in a file + * @return a new instance of type [[T]] + */ + def fromArgMap(argMap: Map[String, String], lineNumber: Option[Int] = None): T = { + reflectiveBuilder.reset() // reset the arguments to their initial values + + val names = argMap.keys.toIndexedSeq + forloop(from = 0, until = names.length) { i => + reflectiveBuilder.argumentLookup.forField(names(i)) match { + case Some(arg) => + val value = { + val tmp = argMap(names(i)) + if (tmp.isEmpty && arg.argumentType == classOf[Option[_]]) ReflectionUtil.SpecialEmptyOrNoneToken else tmp + } + + val argumentValue = ReflectionUtil.constructFromString(arg.argumentType, arg.unitType, value) match { + case Success(v) => v + case Failure(thr) => + fail( + message = s"Could not construct value for column '${arg.name}' of type '${arg.typeDescription}' from '$value'", + throwable = Some(thr), + lineNumber = lineNumber + ) + } + arg.value = argumentValue + case None => + fail( + message = s"Did not have a field with name '${names(i)}'.", + lineNumber = lineNumber + ) + } + } + + // build it. NB: if arguments are missing values, then an exception will be thrown here + // Also, we don't use the default "build()" method since if a collection or option is empty, it will be treated as + // missing. + val params = reflectiveBuilder.argumentLookup.ordered.map(arg => arg.value getOrElse unreachable(s"Arguments not set: ${arg.name}")) + reflectiveBuilder.build(params) + } + + /** Logs the throwable, if given, and throws a [[FailureException]] with information about when reading metrics fails + * + * @param message the message to include in the exception thrown + * @param throwable optionally, a throwable that should be logged + * @param lineNumber optionally, the line number when building a metric from a line in a file + */ + def fail(message: String, throwable: Option[Throwable] = None, lineNumber: Option[Int] = None): Unit = { + throwable.foreach { thr => + val stringWriter = new StringWriter + thr.printStackTrace(new PrintWriter(stringWriter)) + val banner = "#" * 80 + logger.debug(banner) + logger.debug(stringWriter.toString) + logger.debug(banner) + } + val sourceMessage = source.map("\nIn source: " + _).getOrElse("") + val prefix = lineNumber match { + case None => "For metric" + case Some(n) => s"On line #$n for metric" + } + val fullMessage = s"$prefix '${clazz.getSimpleName}'$sourceMessage\n$message" + + throw FailureException(message = Some(fullMessage)) + } +} diff --git a/src/main/scala/com/fulcrumgenomics/util/MetricSorter.scala b/src/main/scala/com/fulcrumgenomics/util/MetricSorter.scala new file mode 100644 index 000000000..a1632a230 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/util/MetricSorter.scala @@ -0,0 +1,70 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.commons.CommonsDef.DirPath + +import scala.reflect.runtime.{universe => ru} + +/** Disk-backed metrics sorter + * + * @param maxObjectsInRam the maximum number of metrics to keep in memory before spilling to disk + * @param keyfunc method to convert a metric to an ordered key + * @param tmpDir the temporary directory in which to spill to disk + * @param tt the type tag for [[T]] + * @tparam Key the key to use for sorting metrics + * @tparam T the metric type + */ +class MetricSorter[Key <: Ordered[Key], T <: Metric](maxObjectsInRam: Int = MetricSorter.MaxInMemory, + keyfunc: T => Key, + tmpDir: DirPath = Io.tmpDir, + + )(implicit tt: ru.TypeTag[T]) extends Sorter[T, Key]( + maxObjectsInRam = maxObjectsInRam, + codec = new MetricSorter.MetricSorterCodec[T](), + keyfunc = keyfunc, + tmpDir = tmpDir +) + +object MetricSorter { + /** The default maximum # of records to keep and sort in memory. */ + val MaxInMemory: Int = 1e6.toInt + + /** The codec for encoding and decoding a metric */ + class MetricSorterCodec[T <: Metric]()(implicit tt: ru.TypeTag[T]) + extends Sorter.Codec[T] { + private val builder = new MetricBuilder[T]() + + /** Encode the metric into an array of bytes. */ + def encode(metric: T): Array[Byte] = metric.values.mkString(Metric.DelimiterAsString).getBytes + + /** Decode a metric from an array of bytes. */ + def decode(bs: Array[Byte], start: Int, length: Int): T = { + val fields = new String(bs.slice(from = start, until = start + length)).split(Metric.DelimiterAsString) + builder.fromValues(fields) + } + } +} \ No newline at end of file diff --git a/src/test/scala/com/fulcrumgenomics/util/MetricBuilderTest.scala b/src/test/scala/com/fulcrumgenomics/util/MetricBuilderTest.scala new file mode 100644 index 000000000..4c7dc21bd --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/util/MetricBuilderTest.scala @@ -0,0 +1,43 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.testing.UnitSpec + + +case class MetricBuilderTestMetric(name: String, count: Long = 1) extends Metric + +class MetricBuilderTest extends UnitSpec { + private val builder = new MetricBuilder[MetricBuilderTestMetric]() + + "MetricBuilder.fromArgMap" should "build a metric from an argmap with all value specified" in { + builder.fromArgMap(Map("name" -> "foo", "count" -> "2")) shouldBe MetricBuilderTestMetric(name="foo", count=2) + } + + it should "build a metric from an argmap with only required values specified" in { + builder.fromArgMap(Map("name" -> "foo")) shouldBe MetricBuilderTestMetric(name="foo") + } +} diff --git a/src/test/scala/com/fulcrumgenomics/util/MetricSorterTest.scala b/src/test/scala/com/fulcrumgenomics/util/MetricSorterTest.scala new file mode 100644 index 000000000..02c0ac8a1 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/util/MetricSorterTest.scala @@ -0,0 +1,70 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.testing.UnitSpec + +case class MetricSorterTestMetric(name: String, count: Long) extends Metric with Ordered[MetricSorterTestMetric] { + override def compare(that: MetricSorterTestMetric): Int = { + var retval = this.count.compare(that.count) + if (retval == 0) retval = this.name.compare(that.name) + retval + } +} + +class MetricSorterTest extends UnitSpec { + + private val metrics = IndexedSeq( + MetricSorterTestMetric(name="foo", count=10), + MetricSorterTestMetric(name="foo", count=1), + MetricSorterTestMetric(name="bar", count=1), + MetricSorterTestMetric(name="foo", count=5), + MetricSorterTestMetric(name="roger", count=2), + MetricSorterTestMetric(name="nadal", count=2), + ) + + private val metricsSorted = metrics.sortBy(m => (m.count, m.name)) + + private case class Key() + + "MetricSorter" should "sort metrics in memory" in { + val sorter = new MetricSorter[MetricSorterTestMetric, MetricSorterTestMetric]( + maxObjectsInRam = 10000, + keyfunc = identity + ) + sorter ++= metrics + sorter.iterator.toSeq should contain theSameElementsInOrderAs metricsSorted + } + + it should "sort metrics after spilling to disk" in { + val sorter = new MetricSorter[MetricSorterTestMetric, MetricSorterTestMetric]( + maxObjectsInRam = 2, + keyfunc = identity + ) + sorter ++= metrics + sorter.iterator.toSeq should contain theSameElementsInOrderAs metricsSorted + } +}