diff --git a/CHANGELOG.md b/CHANGELOG.md index a7ba52ed68e7..4aaee0784caf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -190,6 +190,7 @@ - [Added various date part functions to `Date` and `Date_Time`.][3669] - [Implemented `Table.take` and `Table.drop` for the in-memory backend.][3647] - [Implemented specialized storage for the in-memory Table.][3673] +- [Implemented `Table.distinct` for the in-memory backend.][3684] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -303,6 +304,7 @@ [3669]: https://github.com/enso-org/enso/pull/3669 [3647]: https://github.com/enso-org/enso/pull/3647 [3673]: https://github.com/enso-org/enso/pull/3673 +[3684]: https://github.com/enso-org/enso/pull/3684 #### Enso Compiler diff --git a/build.sbt b/build.sbt index d2d3d4b142bb..f5a89af8e4d4 100644 --- a/build.sbt +++ b/build.sbt @@ -1767,7 +1767,7 @@ lazy val `std-table` = project Compile / packageBin / artifactPath := `table-polyglot-root` / "std-table.jar", libraryDependencies ++= Seq( - "com.ibm.icu" % "icu4j" % icuVersion, + "com.ibm.icu" % "icu4j" % icuVersion % "provided", "com.univocity" % "univocity-parsers" % "2.9.1", "org.apache.poi" % "poi-ooxml" % "5.2.2", "org.apache.xmlbeans" % "xmlbeans" % "5.1.0", @@ -1786,6 +1786,7 @@ lazy val `std-table` = project result }.value ) + .dependsOn(`std-base` % "provided") lazy val `std-image` = project .in(file("std-bits") / "image") diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Case.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Case.enso index 1e2360090e94..4711df320e48 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Case.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Case.enso @@ -1,3 +1,7 @@ +from Standard.Base import all + +polyglot java import org.enso.base.text.TextFoldingStrategy + ## Specifies the casing options for text conversion. type Case ## All letters in lower case. 
@@ -8,3 +12,19 @@ type Case ## First letter of each word in upper case, rest in lower case. Title + +## Represents case-insensitive comparison mode. + + Arguments: + - locale: The locale used for the comparison. +type Case_Insensitive + Case_Insensitive_Data locale=Locale.default + +## PRIVATE + Creates a Java `TextFoldingStrategy` from the case sensitivity setting: `True` selects `unicodeNormalizedFold` and `Case_Insensitive_Data locale` selects `caseInsensitiveFold` for that locale. +folding_strategy : (True|Case_Insensitive) -> TextFoldingStrategy +folding_strategy case_sensitive = case case_sensitive of + True -> TextFoldingStrategy.unicodeNormalizedFold + Case_Insensitive_Data locale -> + TextFoldingStrategy.caseInsensitiveFold locale.java_locale + diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching.enso index f7d2ab82edd7..1727eae51a8a 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching.enso @@ -1,5 +1,6 @@ from Standard.Base import all +from Standard.Base.Data.Text.Case import Case_Insensitive, Case_Insensitive_Data from Standard.Base.Error.Problem_Behavior import Report_Warning from Standard.Base.Error.Common import Wrapped_Dataflow_Error_Data @@ -13,13 +14,6 @@ No_Matches_Found.to_display_text self = "The criteria "+self.criteria.to_text+" did not match any names in the input." -## Represents case-insensitive comparison mode. - - Arguments: - - locale: The locale used for the comparison. -type Case_Insensitive - Case_Insensitive_Data locale=Locale.default - ## Represents exact text matching mode. 
Arguments: diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso index 9c2abd9494ab..4d16e16e6473 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso @@ -19,6 +19,7 @@ import project.Data.Regression import project.Data.Statistics import project.Data.Statistics.Rank_Method import project.Data.Text +import project.Data.Text.Case import project.Data.Text.Encoding import project.Data.Text.Extensions import project.Data.Text.Matching @@ -97,7 +98,8 @@ from project.Data.Range export all https://www.pivotaltracker.com/story/show/181403340 https://www.pivotaltracker.com/story/show/181309938 from project.Data.Text.Extensions export Text, Line_Ending_Style, Case, Location, Matching_Mode -from project.Data.Text.Matching export Case_Insensitive_Data, Text_Matcher_Data, Regex_Matcher_Data, No_Matches_Found_Data +from project.Data.Text.Matching export Text_Matcher_Data, Regex_Matcher_Data, No_Matches_Found_Data +from project.Data.Text.Case export Case_Insensitive_Data from project.Data.Text export all hiding Encoding, Span, Text_Ordering from project.Data.Text.Encoding export Encoding, Encoding_Error, Encoding_Error_Data from project.Data.Text.Text_Ordering export all diff --git a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso index 936ee1be574a..b31b94f2f9f1 100644 --- a/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Database/0.0.0-dev/src/Data/Table.enso @@ -19,7 +19,7 @@ import Standard.Table.Internal.Aggregate_Column_Helper from Standard.Database.Data.Column import Column, Aggregate_Column_Builder, Column_Data from Standard.Database.Data.Internal.IR import Internal_Column, Internal_Column_Data from Standard.Table.Errors import 
No_Such_Column_Error, No_Such_Column_Error_Data -from Standard.Table.Data.Column_Selector import Column_Selector, By_Index +from Standard.Table.Data.Column_Selector import Column_Selector, By_Index, By_Name from Standard.Table.Data.Data_Formatter import Data_Formatter from Standard.Database.Error import Unsupported_Database_Operation_Error_Data import Standard.Table.Data.Column_Name_Mapping @@ -547,6 +547,38 @@ type Table new_ctx = self.context.add_orders new_order_descriptors self.updated_context new_ctx + ## Returns the distinct set of rows within the specified columns from the + input table (not yet implemented for the database backend). + + When multiple rows have the same values within the specified columns, the + first row of each such set is returned. + + For the in-memory table, the unique rows will be in the order they + occurred in the input (this is not guaranteed for database operations). + + Arguments: + - columns: The columns of the table to use for distinguishing the rows. + - case_sensitive: Specifies if the text values should be compared case + sensitively. + - on_problems: Specifies how to handle problems if they occur, reporting + them as warnings by default. + + The following problems can occur: + - If a column in columns is not in the input table, a + `Missing_Input_Columns`. + - If duplicate columns, names or indices are provided, a + `Duplicate_Column_Selectors`. + - If a column index is out of range, a `Column_Indexes_Out_Of_Range`. + - If two distinct indices refer to the same column, an + `Input_Indices_Already_Matched`. + - If no valid columns are selected, a `No_Input_Columns_Selected`. + - If floating point values are present in the distinct columns, a + `Floating_Point_Grouping` warning. 
+ distinct : Column_Selector -> (True|Case_Insensitive) -> Problem_Behavior -> Table + distinct self (columns = By_Name (self.columns.map .name)) case_sensitive=True on_problems=Report_Warning = + _ = [columns, case_sensitive, on_problems] + Error.throw (Unsupported_Database_Operation_Error_Data "`Table.distinct` is not yet implemented for the database backend.") + ## UNSTABLE Efficiently joins two tables based on either the index or a key column. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/NOTICE b/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/NOTICE index 06970576c496..7d3f410db397 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/NOTICE +++ b/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/NOTICE @@ -6,11 +6,6 @@ The license file can be found at `licenses/BSD-3-Clause`. Copyright notices related to this dependency can be found in the directory `com.github.virtuald.curvesapi-1.07`. -'icu4j', licensed under the Unicode/ICU License, is distributed with the Table. -The license information can be found along with the copyright notices. -Copyright notices related to this dependency can be found in the directory `com.ibm.icu.icu4j-71.1`. - - 'univocity-parsers', licensed under the Apache 2, is distributed with the Table. The license file can be found at `licenses/APACHE2.0`. Copyright notices related to this dependency can be found in the directory `com.univocity.univocity-parsers-2.9.1`. diff --git a/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/com.ibm.icu.icu4j-71.1/LICENSE b/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/com.ibm.icu.icu4j-71.1/LICENSE deleted file mode 100644 index 80b587723a67..000000000000 --- a/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/com.ibm.icu.icu4j-71.1/LICENSE +++ /dev/null @@ -1,519 +0,0 @@ -UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE - -See Terms of Use -for definitions of Unicode Inc.’s Data Files and Software. 
- -NOTICE TO USER: Carefully read the following legal agreement. -BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S -DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), -YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE -TERMS AND CONDITIONS OF THIS AGREEMENT. -IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE -THE DATA FILES OR SOFTWARE. - -COPYRIGHT AND PERMISSION NOTICE - -Copyright © 1991-2022 Unicode, Inc. All rights reserved. -Distributed under the Terms of Use in https://www.unicode.org/copyright.html. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of the Unicode data files and any associated documentation -(the "Data Files") or Unicode software and any associated documentation -(the "Software") to deal in the Data Files or Software -without restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, and/or sell copies of -the Data Files or Software, and to permit persons to whom the Data Files -or Software are furnished to do so, provided that either -(a) this copyright and permission notice appear with all copies -of the Data Files or Software, or -(b) this copyright and permission notice appear in associated -Documentation. - -THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT OF THIRD PARTY RIGHTS. -IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS -NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL -DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, -DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -PERFORMANCE OF THE DATA FILES OR SOFTWARE. 
- -Except as contained in this notice, the name of a copyright holder -shall not be used in advertising or otherwise to promote the sale, -use or other dealings in these Data Files or Software without prior -written authorization of the copyright holder. - ----------------------------------------------------------------------- - -Third-Party Software Licenses - -This section contains third-party software notices and/or additional -terms for licensed third-party software components included within ICU -libraries. - ----------------------------------------------------------------------- - -ICU License - ICU 1.8.1 to ICU 57.1 - -COPYRIGHT AND PERMISSION NOTICE - -Copyright (c) 1995-2016 International Business Machines Corporation and others -All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, and/or sell copies of the Software, and to permit persons -to whom the Software is furnished to do so, provided that the above -copyright notice(s) and this permission notice appear in all copies of -the Software and that both the above copyright notice(s) and this -permission notice appear in supporting documentation. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT -OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY -SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER -RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF -CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- -Except as contained in this notice, the name of a copyright holder -shall not be used in advertising or otherwise to promote the sale, use -or other dealings in this Software without prior written authorization -of the copyright holder. - -All trademarks and registered trademarks mentioned herein are the -property of their respective owners. - ----------------------------------------------------------------------- - -Chinese/Japanese Word Break Dictionary Data (cjdict.txt) - - # The Google Chrome software developed by Google is licensed under - # the BSD license. Other software included in this distribution is - # provided under other licenses, as set forth below. - # - # The BSD License - # http://opensource.org/licenses/bsd-license.php - # Copyright (C) 2006-2008, Google Inc. - # - # All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions are met: - # - # Redistributions of source code must retain the above copyright notice, - # this list of conditions and the following disclaimer. - # Redistributions in binary form must reproduce the above - # copyright notice, this list of conditions and the following - # disclaimer in the documentation and/or other materials provided with - # the distribution. - # Neither the name of Google Inc. nor the names of its - # contributors may be used to endorse or promote products derived from - # this software without specific prior written permission. - # - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND - # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, - # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - # - # - # The word list in cjdict.txt are generated by combining three word lists - # listed below with further processing for compound word breaking. The - # frequency is generated with an iterative training against Google web - # corpora. - # - # * Libtabe (Chinese) - # - https://sourceforge.net/project/?group_id=1519 - # - Its license terms and conditions are shown below. - # - # * IPADIC (Japanese) - # - http://chasen.aist-nara.ac.jp/chasen/distribution.html - # - Its license terms and conditions are shown below. - # - # ---------COPYING.libtabe ---- BEGIN-------------------- - # - # /* - # * Copyright (c) 1999 TaBE Project. - # * Copyright (c) 1999 Pai-Hsiang Hsiao. - # * All rights reserved. - # * - # * Redistribution and use in source and binary forms, with or without - # * modification, are permitted provided that the following conditions - # * are met: - # * - # * . Redistributions of source code must retain the above copyright - # * notice, this list of conditions and the following disclaimer. - # * . Redistributions in binary form must reproduce the above copyright - # * notice, this list of conditions and the following disclaimer in - # * the documentation and/or other materials provided with the - # * distribution. - # * . Neither the name of the TaBE Project nor the names of its - # * contributors may be used to endorse or promote products derived - # * from this software without specific prior written permission. 
- # * - # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - # * OF THE POSSIBILITY OF SUCH DAMAGE. - # */ - # - # /* - # * Copyright (c) 1999 Computer Systems and Communication Lab, - # * Institute of Information Science, Academia - # * Sinica. All rights reserved. - # * - # * Redistribution and use in source and binary forms, with or without - # * modification, are permitted provided that the following conditions - # * are met: - # * - # * . Redistributions of source code must retain the above copyright - # * notice, this list of conditions and the following disclaimer. - # * . Redistributions in binary form must reproduce the above copyright - # * notice, this list of conditions and the following disclaimer in - # * the documentation and/or other materials provided with the - # * distribution. - # * . Neither the name of the Computer Systems and Communication Lab - # * nor the names of its contributors may be used to endorse or - # * promote products derived from this software without specific - # * prior written permission. 
- # * - # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - # * OF THE POSSIBILITY OF SUCH DAMAGE. - # */ - # - # Copyright 1996 Chih-Hao Tsai @ Beckman Institute, - # University of Illinois - # c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4 - # - # ---------------COPYING.libtabe-----END-------------------------------- - # - # - # ---------------COPYING.ipadic-----BEGIN------------------------------- - # - # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science - # and Technology. All Rights Reserved. - # - # Use, reproduction, and distribution of this software is permitted. - # Any copy of this software, whether in its original form or modified, - # must include both the above copyright notice and the following - # paragraphs. 
- # - # Nara Institute of Science and Technology (NAIST), - # the copyright holders, disclaims all warranties with regard to this - # software, including all implied warranties of merchantability and - # fitness, in no event shall NAIST be liable for - # any special, indirect or consequential damages or any damages - # whatsoever resulting from loss of use, data or profits, whether in an - # action of contract, negligence or other tortuous action, arising out - # of or in connection with the use or performance of this software. - # - # A large portion of the dictionary entries - # originate from ICOT Free Software. The following conditions for ICOT - # Free Software applies to the current dictionary as well. - # - # Each User may also freely distribute the Program, whether in its - # original form or modified, to any third party or parties, PROVIDED - # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear - # on, or be attached to, the Program, which is distributed substantially - # in the same form as set out herein and that such intended - # distribution, if actually made, will neither violate or otherwise - # contravene any of the laws and regulations of the countries having - # jurisdiction over the User or the intended distribution itself. - # - # NO WARRANTY - # - # The program was produced on an experimental basis in the course of the - # research and development conducted during the project and is provided - # to users as so produced on an experimental basis. Accordingly, the - # program is provided without any warranty whatsoever, whether express, - # implied, statutory or otherwise. The term "warranty" used herein - # includes, but is not limited to, any warranty of the quality, - # performance, merchantability and fitness for a particular purpose of - # the program and the nonexistence of any infringement or violation of - # any right of any third party. 
- # - # Each user of the program will agree and understand, and be deemed to - # have agreed and understood, that there is no warranty whatsoever for - # the program and, accordingly, the entire risk arising from or - # otherwise connected with the program is assumed by the user. - # - # Therefore, neither ICOT, the copyright holder, or any other - # organization that participated in or was otherwise related to the - # development of the program and their respective officials, directors, - # officers and other employees shall be held liable for any and all - # damages, including, without limitation, general, special, incidental - # and consequential damages, arising out of or otherwise in connection - # with the use or inability to use the program or any product, material - # or result produced or otherwise obtained by using the program, - # regardless of whether they have been advised of, or otherwise had - # knowledge of, the possibility of such damages at any time during the - # project or thereafter. Each user will be deemed to have agreed to the - # foregoing by his or her commencement of use of the program. The term - # "use" as used herein includes, but is not limited to, the use, - # modification, copying and distribution of the program and the - # production of secondary products from the program. - # - # In the case where the program, whether in its original form or - # modified, was distributed or delivered to or received by a user from - # any person, organization or entity other than ICOT, unless it makes or - # grants independently of ICOT any specific warranty to the user in - # writing, such person, organization or entity, will also be exempted - # from and not be held liable to the user for any such damages as noted - # above as far as the program is concerned. 
- # - # ---------------COPYING.ipadic-----END---------------------------------- - ----------------------------------------------------------------------- - -Lao Word Break Dictionary Data (laodict.txt) - - # Copyright (C) 2016 and later: Unicode, Inc. and others. - # License & terms of use: http://www.unicode.org/copyright.html - # Copyright (c) 2015 International Business Machines Corporation - # and others. All Rights Reserved. - # - # Project: https://github.com/rober42539/lao-dictionary - # Dictionary: https://github.com/rober42539/lao-dictionary/laodict.txt - # License: https://github.com/rober42539/lao-dictionary/LICENSE.txt - # (copied below) - # - # This file is derived from the above dictionary version of Nov 22, 2020 - # ---------------------------------------------------------------------- - # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell. - # All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions are met: - # - # Redistributions of source code must retain the above copyright notice, this - # list of conditions and the following disclaimer. Redistributions in binary - # form must reproduce the above copyright notice, this list of conditions and - # the following disclaimer in the documentation and/or other materials - # provided with the distribution. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - # OF THE POSSIBILITY OF SUCH DAMAGE. - # -------------------------------------------------------------------------- - ----------------------------------------------------------------------- - -Burmese Word Break Dictionary Data (burmesedict.txt) - - # Copyright (c) 2014 International Business Machines Corporation - # and others. All Rights Reserved. - # - # This list is part of a project hosted at: - # github.com/kanyawtech/myanmar-karen-word-lists - # - # -------------------------------------------------------------------------- - # Copyright (c) 2013, LeRoy Benjamin Sharon - # All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: Redistributions of source code must retain the above - # copyright notice, this list of conditions and the following - # disclaimer. Redistributions in binary form must reproduce the - # above copyright notice, this list of conditions and the following - # disclaimer in the documentation and/or other materials provided - # with the distribution. - # - # Neither the name Myanmar Karen Word Lists, nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. 
- # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND - # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, - # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS - # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR - # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF - # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - # SUCH DAMAGE. - # -------------------------------------------------------------------------- - ----------------------------------------------------------------------- - -Time Zone Database - - ICU uses the public domain data and code derived from Time Zone -Database for its time zone support. The ownership of the TZ database -is explained in BCP 175: Procedure for Maintaining the Time Zone -Database section 7. - - # 7. Database Ownership - # - # The TZ database itself is not an IETF Contribution or an IETF - # document. Rather it is a pre-existing and regularly updated work - # that is in the public domain, and is intended to remain in the - # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do - # not apply to the TZ Database or contributions that individuals make - # to it. Should any claims be made and substantiated against the TZ - # Database, the organization that is providing the IANA - # Considerations defined in this RFC, under the memorandum of - # understanding with the IETF, currently ICANN, may act in accordance - # with all competent court orders. No ownership claims will be made - # by ICANN or the IETF Trust on the database or the code. 
Any person - # making a contribution to the database or code waives all rights to - # future claims in that contribution or in the TZ Database. - ----------------------------------------------------------------------- - -Google double-conversion - -Copyright 2006-2011, the V8 project authors. All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of Google Inc. nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - ----------------------------------------------------------------------- - -File: aclocal.m4 (only for ICU4C) -Section: pkg.m4 - Macros to locate and utilise pkg-config. - - -Copyright © 2004 Scott James Remnant . 
-Copyright © 2012-2015 Dan Nicholson - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -02111-1307, USA. - -As a special exception to the GNU General Public License, if you -distribute this file as part of a program that contains a -configuration script generated by Autoconf, you may include it under -the same distribution terms that you use for the rest of that -program. - - -(The condition for the exception is fulfilled because -ICU4C includes a configuration script generated by Autoconf, -namely the `configure` script.) - ----------------------------------------------------------------------- - -File: config.guess (only for ICU4C) - - -This file is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, see . 
- -As a special exception to the GNU General Public License, if you -distribute this file as part of a program that contains a -configuration script generated by Autoconf, you may include it under -the same distribution terms that you use for the rest of that -program. This Exception is an additional permission under section 7 -of the GNU General Public License, version 3 ("GPLv3"). - - -(The condition for the exception is fulfilled because -ICU4C includes a configuration script generated by Autoconf, -namely the `configure` script.) - ----------------------------------------------------------------------- - -File: install-sh (only for ICU4C) - - -Copyright 1991 by the Massachusetts Institute of Technology - -Permission to use, copy, modify, distribute, and sell this software and its -documentation for any purpose is hereby granted without fee, provided that -the above copyright notice appear in all copies and that both that -copyright notice and this permission notice appear in supporting -documentation, and that the name of M.I.T. not be used in advertising or -publicity pertaining to distribution of the software without specific, -written prior permission. M.I.T. makes no representations about the -suitability of this software for any purpose. It is provided "as is" -without express or implied warranty. 
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/com.ibm.icu.icu4j-71.1/NOTICES b/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/com.ibm.icu.icu4j-71.1/NOTICES deleted file mode 100644 index 0035166ac5c5..000000000000 --- a/distribution/lib/Standard/Table/0.0.0-dev/THIRD-PARTY/com.ibm.icu.icu4j-71.1/NOTICES +++ /dev/null @@ -1,3 +0,0 @@ -Copyright (C) 1996-2004, International Business Machines Corporation and * - -Copyright (C) 2001-2014, International Business Machines diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index 5547fe1f96ab..9a53eb5a4fec 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -3,6 +3,7 @@ import Standard.Base.Error.Common as Errors from Standard.Base.Error.Problem_Behavior import Report_Warning import Standard.Base.Data.Index_Sub_Range import Standard.Base.Data.Ordering.Comparator +import Standard.Base.Data.Text.Case import Standard.Base.System.Platform import Standard.Table.Data.Column @@ -14,10 +15,10 @@ import Standard.Table.Internal.Delimited_Reader import Standard.Table.Internal.Delimited_Writer import Standard.Table.Internal.Problem_Builder -from Standard.Table.Data.Column_Selector import Column_Selector, By_Index +from Standard.Table.Data.Column_Selector import Column_Selector, By_Index, By_Name from Standard.Table.Data.Column_Type_Selection import Column_Type_Selection, Auto from Standard.Table.Data.Data_Formatter import Data_Formatter, Data_Formatter_Data -from Standard.Table.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data +from Standard.Table.Errors import Missing_Input_Columns, Column_Indexes_Out_Of_Range, Duplicate_Type_Selector, No_Index_Set_Error, No_Such_Column_Error, No_Such_Column_Error_Data, 
No_Input_Columns_Selected, No_Output_Columns import Standard.Table.Data.Match_Columns import Standard.Table.Data.Column_Name_Mapping @@ -541,7 +542,7 @@ type Table aggregate self columns (on_problems=Report_Warning) = validated = Aggregate_Column_Helper.prepare_aggregate_columns columns self - on_problems.attach_problems_before validated.problems <| + on_problems.attach_problems_before validated.problems <| Illegal_Argument_Error.handle_java_exception <| java_key_columns = validated.key_columns.map .java_column index = self.java_table.indexFromColumns java_key_columns.to_array Comparator.new @@ -650,9 +651,53 @@ type Table selected_columns = columns_for_ordering.map c->c.column.java_column ordering = columns_for_ordering.map c->c.associated_selector.direction.to_sign comparator = Comparator.for_text_ordering text_ordering - java_table = self.java_table.orderBy selected_columns.to_array ordering.to_array comparator + java_table = Illegal_Argument_Error.handle_java_exception <| + self.java_table.orderBy selected_columns.to_array ordering.to_array comparator Table_Data java_table + ## Returns the distinct set of rows within the specified columns from the + input table. + + When multiple rows have the same values within the specified columns, the + first row of each such set is returned. + + For the in-memory table, the unique rows will be in the order they + occurred in the input (this is not guaranteed for database operations). + + Arguments: + - columns: The columns of the table to use for distinguishing the rows. + - case_sensitive: Specifies if the text values should be compared case + sensitively. + - on_problems: Specifies how to handle if a problem occurs, raising as a + warning by default. + + The following problems can occur: + - If a column in columns is not in the input table, a + `Missing_Input_Columns`. + - If duplicate columns, names or indices are provided, a + `Duplicate_Column_Selectors`. 
+ - If a column index is out of range, a `Column_Indexes_Out_Of_Range`. + - If two distinct indices refer to the same column, an + `Input_Indices_Already_Matched`. + - If no valid columns are selected, a `No_Input_Columns_Selected`. + - If floating point values are present in the distinct columns, a + `Floating_Point_Grouping` warning. + distinct : Column_Selector -> (True|Case_Insensitive) -> Problem_Behavior -> Table + distinct self (columns = By_Name (self.columns.map .name)) case_sensitive=True on_problems=Report_Warning = + warning_mapper error = case error of + No_Output_Columns -> Maybe.Some No_Input_Columns_Selected + _ -> Nothing + key_columns = Warning.map_warnings_and_errors warning_mapper <| + Table_Helpers.select_columns internal_columns=self.columns selector=columns reorder=True on_problems=on_problems + java_columns = key_columns.map .java_column + text_folding_strategy = Case.folding_strategy case_sensitive + java_table = Illegal_Argument_Error.handle_java_exception <| + self.java_table.distinct java_columns.to_array text_folding_strategy + on_problems.attach_problems_after (Table_Data java_table) <| + problems = java_table.getProblems + Aggregate_Column_Helper.parse_aggregated_problems problems + + ## Parses columns within a Table to a specific value type. By default, it looks at all `Text` columns and attempts to deduce the type (columns with other types are not affected). If `column_types` are diff --git a/std-bits/base/src/main/java/org/enso/base/text/TextFoldingStrategy.java b/std-bits/base/src/main/java/org/enso/base/text/TextFoldingStrategy.java new file mode 100644 index 000000000000..7c74fddae5b4 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/text/TextFoldingStrategy.java @@ -0,0 +1,27 @@ +package org.enso.base.text; + +import org.enso.base.Text_Utils; + +import java.util.Locale; + +/** A strategy for folding text values for comparison and hashing.
*/ +public interface TextFoldingStrategy { + String fold(String value); + + /** + * A folding strategy that ensures the strings are normalized, so various equivalent Unicode forms + * are equated. + */ + TextFoldingStrategy unicodeNormalizedFold = Text_Utils::normalize; + + /** + * A folding strategy that not only normalizes the Unicode strings but also ensures + * case-insensitive comparison. It needs a locale for locale-specific case handling. + */ + static TextFoldingStrategy caseInsensitiveFold(Locale locale) { + return (String value) -> { + String normalized = Text_Utils.normalize(value); + return Text_Utils.case_insensitive_key(normalized, locale); + }; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/Concatenate.java b/std-bits/table/src/main/java/org/enso/table/aggregations/Concatenate.java index 8f0578503a92..82ca979e376a 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/Concatenate.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/Concatenate.java @@ -1,11 +1,12 @@ package org.enso.table.aggregations; -import java.util.List; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.table.Column; import org.enso.table.data.table.problems.InvalidAggregation; import org.enso.table.data.table.problems.UnquotedDelimiter; +import java.util.List; + public class Concatenate extends Aggregator { private final Storage storage; private final String separator; diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java b/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java index 4beb7b0474b2..43bd6d98552e 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/CountDistinct.java @@ -1,11 +1,14 @@ package org.enso.table.aggregations; import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.index.MultiValueKey; 
+import org.enso.table.data.index.UnorderedMultiValueKey; import org.enso.table.data.table.Column; import org.enso.table.data.table.problems.FloatingPointGrouping; -import java.util.*; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; /** * Aggregate Column counting the number of distinct items in a group. If `ignoreAllNull` is true, @@ -33,9 +36,9 @@ public CountDistinct( @Override public Object aggregate(List indexes) { - Set set = new HashSet<>(); + HashSet set = new HashSet<>(); for (int row : indexes) { - MultiValueKey key = new MultiValueKey(storage, row, objectComparator); + UnorderedMultiValueKey key = new UnorderedMultiValueKey(storage, row); if (key.hasFloatValues()) { this.addProblem(new FloatingPointGrouping(this.getName(), row)); } diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/First.java b/std-bits/table/src/main/java/org/enso/table/aggregations/First.java index 621d8a91e711..dcb6db4660bb 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/First.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/First.java @@ -1,7 +1,7 @@ package org.enso.table.aggregations; import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.index.MultiValueKey; +import org.enso.table.data.index.OrderedMultiValueKey; import org.enso.table.data.table.Column; import java.util.Arrays; @@ -51,7 +51,7 @@ public Object aggregate(List indexes) { } private Object firstBySpecifiedOrder(List indexes) { - MultiValueKey key = null; + OrderedMultiValueKey key = null; Object current = null; for (int row : indexes) { @@ -60,8 +60,9 @@ private Object firstBySpecifiedOrder(List indexes) { continue; } - MultiValueKey newKey = - new MultiValueKey(this.orderByColumns, row, this.orderByDirections, objectComparator); + OrderedMultiValueKey newKey = + new OrderedMultiValueKey( + this.orderByColumns, row, this.orderByDirections, objectComparator); if (key == null || 
key.compareTo(newKey) > 0) { key = newKey; current = storage.getItemBoxed(row); diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/Last.java b/std-bits/table/src/main/java/org/enso/table/aggregations/Last.java index 129689ed268d..02843e4a39a7 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/Last.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/Last.java @@ -1,7 +1,7 @@ package org.enso.table.aggregations; import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.index.MultiValueKey; +import org.enso.table.data.index.OrderedMultiValueKey; import org.enso.table.data.table.Column; import java.util.Arrays; @@ -50,7 +50,7 @@ public Object aggregate(List indexes) { } private Object lastBySpecifiedOrder(List indexes) { - MultiValueKey key = null; + OrderedMultiValueKey key = null; Object current = null; for (int i = indexes.size() - 1; i >= 0; i--) { @@ -60,8 +60,9 @@ private Object lastBySpecifiedOrder(List indexes) { continue; } - MultiValueKey newKey = - new MultiValueKey(this.orderByColumns, row, this.orderByDirections, objectComparator); + OrderedMultiValueKey newKey = + new OrderedMultiValueKey( + this.orderByColumns, row, this.orderByDirections, objectComparator); if (key == null || key.compareTo(newKey) < 0) { key = newKey; current = storage.getItemBoxed(row); diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/Percentile.java b/std-bits/table/src/main/java/org/enso/table/aggregations/Percentile.java index 2eabb3259a70..c8fb79341a3b 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/Percentile.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/Percentile.java @@ -1,12 +1,13 @@ package org.enso.table.aggregations; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.table.Column; +import org.enso.table.data.table.problems.InvalidAggregation; + import java.util.List; import java.util.Map; import 
java.util.SortedMap; import java.util.TreeMap; -import org.enso.table.data.column.storage.Storage; -import org.enso.table.data.table.Column; -import org.enso.table.data.table.problems.InvalidAggregation; /** Aggregate Column computing a percentile value in a group. */ public class Percentile extends Aggregator { diff --git a/std-bits/table/src/main/java/org/enso/table/aggregations/StandardDeviation.java b/std-bits/table/src/main/java/org/enso/table/aggregations/StandardDeviation.java index 3324a162d261..041815aab547 100644 --- a/std-bits/table/src/main/java/org/enso/table/aggregations/StandardDeviation.java +++ b/std-bits/table/src/main/java/org/enso/table/aggregations/StandardDeviation.java @@ -1,10 +1,11 @@ package org.enso.table.aggregations; -import java.util.List; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.table.Column; import org.enso.table.data.table.problems.InvalidAggregation; +import java.util.List; + /** Aggregate Column computing the standard deviation of a group. 
*/ public class StandardDeviation extends Aggregator { private static class Calculation { diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/DefaultIndex.java b/std-bits/table/src/main/java/org/enso/table/data/index/DefaultIndex.java index 48a308655349..57182748e32e 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/DefaultIndex.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/DefaultIndex.java @@ -1,12 +1,11 @@ package org.enso.table.data.index; -import org.enso.table.data.mask.OrderMask; -import org.enso.table.data.mask.SliceRange; -import org.enso.table.data.table.Column; - import java.util.BitSet; import java.util.Collections; import java.util.List; +import org.enso.table.data.mask.OrderMask; +import org.enso.table.data.mask.SliceRange; +import org.enso.table.data.table.Column; public class DefaultIndex extends Index { private final int size; diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/HashIndex.java b/std-bits/table/src/main/java/org/enso/table/data/index/HashIndex.java index c4ffc483862e..4aa13768bd40 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/HashIndex.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/HashIndex.java @@ -1,12 +1,11 @@ package org.enso.table.data.index; +import java.util.*; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.mask.OrderMask; import org.enso.table.data.mask.SliceRange; import org.enso.table.data.table.Column; -import java.util.*; - public class HashIndex extends Index { private final Storage items; private final Map> locs; diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/Index.java b/std-bits/table/src/main/java/org/enso/table/data/index/Index.java index c17e0a0d5cff..416285b72a50 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/Index.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/Index.java @@ -1,12 +1,11 @@ package 
org.enso.table.data.index; +import java.util.BitSet; +import java.util.List; import org.enso.table.data.mask.OrderMask; import org.enso.table.data.mask.SliceRange; import org.enso.table.data.table.Column; -import java.util.BitSet; -import java.util.List; - /** A storage class for ordered multisets. */ public abstract class Index { public static final int NOT_FOUND = -1; diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java index 5de9c069a223..1c8478e591a4 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java +++ b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueIndex.java @@ -1,37 +1,46 @@ package org.enso.table.data.index; +import java.util.*; +import java.util.function.IntFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import org.enso.table.aggregations.Aggregator; -import org.enso.table.data.column.builder.object.StringBuilder; import org.enso.table.data.column.builder.object.*; +import org.enso.table.data.column.builder.object.StringBuilder; import org.enso.table.data.column.storage.Storage; import org.enso.table.data.table.Column; import org.enso.table.data.table.Table; import org.enso.table.data.table.problems.AggregatedProblems; import org.enso.table.data.table.problems.FloatingPointGrouping; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - public class MultiValueIndex { private final int keyColumnsLength; - private final Map> locs; + private final Map> locs; private final AggregatedProblems problems; public MultiValueIndex(Column[] keyColumns, int tableSize, Comparator objectComparator) { this(keyColumns, tableSize, null, objectComparator); } - public MultiValueIndex(Column[] keyColumns, int tableSize, int[] ordering, Comparator objectComparator) { + public MultiValueIndex( + Column[] keyColumns, int tableSize, 
int[] ordering, Comparator objectComparator) { this.keyColumnsLength = keyColumns.length; - this.locs = ordering == null ? new HashMap<>() : new TreeMap<>(); this.problems = new AggregatedProblems(); + boolean isOrdered = ordering != null; + this.locs = isOrdered ? new TreeMap<>() : new HashMap<>(); + + Storage[] storage = Arrays.stream(keyColumns).map(Column::getStorage).toArray(Storage[]::new); + IntFunction keyFactory = + isOrdered + ? i -> new OrderedMultiValueKey(storage, i, ordering, objectComparator) + : i -> new UnorderedMultiValueKey(storage, i); + if (keyColumns.length != 0) { int size = keyColumns[0].getSize(); - Storage[] storage = Arrays.stream(keyColumns).map(Column::getStorage).toArray(Storage[]::new); + for (int i = 0; i < size; i++) { - MultiValueKey key = new MultiValueKey(storage, i, ordering, objectComparator); + MultiValueKeyBase key = keyFactory.apply(i); if (key.hasFloatValues()) { problems.add(new FloatingPointGrouping("GroupBy", i)); @@ -41,7 +50,8 @@ public MultiValueIndex(Column[] keyColumns, int tableSize, int[] ordering, Compa ids.add(i); } } else { - this.locs.put(new MultiValueKey(new Storage[0], 0, objectComparator), IntStream.range(0, tableSize).boxed().collect(Collectors.toList())); + this.locs.put( + keyFactory.apply(0), IntStream.range(0, tableSize).boxed().collect(Collectors.toList())); } } diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKey.java b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKey.java deleted file mode 100644 index 79c5ce885c9c..000000000000 --- a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKey.java +++ /dev/null @@ -1,127 +0,0 @@ -package org.enso.table.data.index; - -import org.enso.table.data.column.storage.Storage; - -import java.util.Arrays; -import java.util.Comparator; - -public class MultiValueKey implements Comparable { - private final Storage[] storage; - private final int[] directions; - private final int rowIndex; - private final 
Comparator objectComparator; - private final int hashCodeValue; - private final boolean allNull; - private final boolean floatValue; - - public MultiValueKey(Storage[] storage, int rowIndex, Comparator objectComparator) { - this(storage, rowIndex, null, objectComparator); - } - - public MultiValueKey( - Storage[] storage, int rowIndex, int[] directions, Comparator objectComparator) { - this.storage = storage; - this.rowIndex = rowIndex; - - if (directions == null) { - directions = new int[storage.length]; - Arrays.fill(directions, 1); - } - this.directions = directions; - - this.objectComparator = objectComparator; - - boolean allNull = true; - boolean floatValue = false; - - // Precompute HashCode - using Apache.Commons.Collections.Map.MultiKeyMap.hash algorithm - int h = 1; - for (int i = 0; i < storage.length; i++) { - h = 31 * h; - - Object value = this.get(i); - if (value != null) { - Object folded = foldObject(value); - floatValue = floatValue || (folded instanceof Double); - h += folded.hashCode(); - allNull = false; - } - } - - this.hashCodeValue = h; - this.allNull = allNull; - this.floatValue = floatValue; - } - - public Object get(int column) { - return storage[column].getItemBoxed(rowIndex); - } - - @Override - public int hashCode() { - return this.hashCodeValue; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof MultiValueKey that)) return false; - if (storage.length != that.storage.length) return false; - if (hashCodeValue != that.hashCodeValue) return false; - for (int i = 0; i < storage.length; i++) { - if (objectComparator.compare(get(i), that.get(i)) != 0) { - return false; - } - } - - return true; - } - - public boolean areAllNull() { - return allNull; - } - - public boolean hasFloatValues() { - return floatValue; - } - - protected static Object foldObject(Object value) { - if (value instanceof Long) { - return value; - } else if (value instanceof Integer) { - return ((Integer) 
value).longValue(); - } else if (value instanceof Byte) { - return ((Byte) value).longValue(); - } else if (value instanceof Float && ((Float) value) % 1 == 0) { - return ((Float) value).longValue(); - } else if (value instanceof Double && ((Double) value) % 1 == 0) { - return ((Double) value).longValue(); - } else if (value instanceof Float) { - return ((Float) value).doubleValue(); - } else if (value instanceof Double) { - return value; - } - - return value; - } - - @Override - public int compareTo(MultiValueKey that) { - if (objectComparator == null || that == null) { - throw new NullPointerException(); - } - - if (that.storage.length != storage.length) { - throw new ClassCastException("Incomparable keys."); - } - - for (int i = 0; i < storage.length; i++) { - int comparison = objectComparator.compare(get(i), that.get(i)); - if (comparison != 0) { - return comparison * directions[i]; - } - } - - return 0; - } -} diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java new file mode 100644 index 000000000000..998dde2178ce --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/index/MultiValueKeyBase.java @@ -0,0 +1,65 @@ +package org.enso.table.data.index; + +import org.enso.table.data.column.storage.Storage; + +/** The base class for keys used for sorting/grouping rows by a set of columns. */ +public abstract class MultiValueKeyBase { + protected final Storage[] storages; + protected final int rowIndex; + protected boolean hasFloatValues = false; + protected boolean floatsComputed = false; + + /** + * Constructs a key based on an array of column storages and the index of the row the key is + * associated with. + */ + public MultiValueKeyBase(Storage[] storage, int rowIndex) { + this.storages = storage; + this.rowIndex = rowIndex; + } + + /** A helper function to get the item from the nth column of the key's row. 
*/ + protected Object get(int column) { + return storages[column].getItemBoxed(rowIndex); + } + + @Override + public abstract boolean equals(Object o); + + /** Checks if all cells in the current row are missing. */ + public boolean areAllNull() { + for (Storage value : storages) { + if (!value.isNa(rowIndex)) { + return false; + } + } + return true; + } + + /* Checks if any cell of the current row holds a Double or Float value. + + The check inspects the raw, unfolded values, so a float holding a whole number (e.g. 2.0) is reported as well; the answer is cached once it has been computed. NOTE(review): the removed MultiValueKey only reported values that were still Double after folding - confirm this change is intended. + */ + public boolean hasFloatValues() { + if (!floatsComputed) { + hasFloatValues = findFloats(); + floatsComputed = true; + } + + return hasFloatValues; + } + + protected boolean isFloatingPoint(Object value) { + return value instanceof Double || value instanceof Float; + } + + private boolean findFloats() { + for (int i = 0; i < storages.length; i++) { + Object value = this.get(i); + if (isFloatingPoint(value)) { + return true; + } + } + return false; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java b/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java new file mode 100644 index 000000000000..9a43aaf8f772 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/index/OrderedMultiValueKey.java @@ -0,0 +1,70 @@ +package org.enso.table.data.index; + +import java.util.Arrays; +import java.util.Comparator; +import org.enso.table.data.column.storage.Storage; + +/** + * A multi-value key for ordered operations like sorting. + * + *

It is meant to be used by sorted collections relying on {@code compareTo}, like {@code + * TreeMap}. It uses an {@code objectComparator} that should expose the Enso comparison logic to the + * Java-verse. + * + *

It currently does not support hashing, as we do not have a hashing implementation consistent + * with Enso's comparison semantics. A key only compares equal to another ordered key. + */ +public class OrderedMultiValueKey extends MultiValueKeyBase + implements Comparable { + private final Comparator objectComparator; + + private final int[] directions; + + public OrderedMultiValueKey( + Storage[] storages, int rowIndex, int[] directions, Comparator objectComparator) { + super(storages, rowIndex); + this.objectComparator = objectComparator; + this.directions = directions; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof OrderedMultiValueKey that)) return false; + if (storages.length != that.storages.length) return false; + for (int i = 0; i < storages.length; i++) { + if (objectComparator.compare(get(i), that.get(i)) != 0) { + return false; + } + } + + return true; + } + + @Override + public int compareTo(OrderedMultiValueKey that) { + if (objectComparator == null || that == null) { + throw new NullPointerException(); + } + + if (that.storages.length != storages.length) { + throw new ClassCastException("Incomparable keys."); + } + + for (int i = 0; i < storages.length; i++) { + int comparison = objectComparator.compare(get(i), that.get(i)); + if (comparison != 0) { + return comparison * directions[i]; + } + } + + return 0; + } + + @Override + public int hashCode() { + throw new IllegalStateException( + "Currently no hash_code implementation consistent with the ObjectComparator is exposed, so" + + " OrderedMultiValueKey is not hashable."); + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/index/UnorderedMultiValueKey.java b/std-bits/table/src/main/java/org/enso/table/data/index/UnorderedMultiValueKey.java new file mode 100644 index 000000000000..4b4d53feb4cd --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/data/index/UnorderedMultiValueKey.java @@ -0,0 +1,131 @@ +package org.enso.table.data.index; + +import java.time.LocalDate;
+import java.time.LocalTime; +import java.time.ZonedDateTime; +import java.util.Objects; + +import org.enso.base.text.TextFoldingStrategy; +import org.enso.table.data.column.storage.Storage; + +/** + * A multi-value key for unordered operations like group-by or distinct. + * + *

It relies on folding logic that coerces values to their representatives in such a way that + * their equality is consistent with how Enso would handle equality. + * + *

As it relies on hashing, it currently is not prepared to work correctly for custom + * Enso-defined objects, as hashing of such objects is not yet implemented properly. + */ +public class UnorderedMultiValueKey extends MultiValueKeyBase { + private final int hashCodeValue; + private final TextFoldingStrategy textFoldingStrategy; + + public UnorderedMultiValueKey(Storage[] storages, int rowIndex) { + this(storages, rowIndex, TextFoldingStrategy.unicodeNormalizedFold); + } + + public UnorderedMultiValueKey( + Storage[] storages, int rowIndex, TextFoldingStrategy textFoldingStrategy) { + super(storages, rowIndex); + this.textFoldingStrategy = textFoldingStrategy; + + // Precompute HashCode - using Apache.Commons.Collections.Map.MultiKeyMap.hash algorithm + int h = 1; + for (int i = 0; i < storages.length; i++) { + h = 31 * h; + + Object value = this.get(i); + if (value != null) { + hasFloatValues = hasFloatValues || isFloatingPoint(value); + Object folded = foldObject(value); + h += folded.hashCode(); + } + } + + this.hashCodeValue = h; + floatsComputed = true; + } + + /** + * Folds the value to ensure consistency with Enso's equality. + * + *

Case-sensitivity of text folding is controlled by {@code textFoldingStrategy}. + */ + protected Object foldObject(Object value) { + if (value == null) { + return null; + } + + if (value instanceof String s) { + return textFoldingStrategy.fold(s); + } + + Object numeric = foldNumeric(value); + if (numeric != null) { + return numeric; + } + + if (value instanceof Boolean) { + return value; + } + + if (value instanceof LocalDate || value instanceof LocalTime || value instanceof ZonedDateTime) { + return value; + } + + throw new IllegalArgumentException("Custom objects in UnorderedMultiValueKey are currently not supported due to lack of hashing support."); + } + + /** + * If the value is a numeric type, this method coerces it in such a way to ensure consistency with + * Enso. + * + *

Integer types are coerced to {@code Long} and floating point values are coerced to {@code + * Double} unless they represent a whole number within the {@code long} range, in which case + * they are coerced to {@code Long} too, to ensure the Enso property that {@code 2 == 2.0}. + * Whole values beyond that range stay {@code Double}: {@code longValue()} would saturate them. + * Returns {@code null} if the value was not a numeric value. + */ + protected Object foldNumeric(Object value) { + if (value instanceof Long) { + return value; + } else if (value instanceof Integer i) { + return i.longValue(); + } else if (value instanceof Byte b) { + return b.longValue(); + } else if (value instanceof Float f && f % 1 == 0 && f >= -0x1p63f && f < 0x1p63f) { + return f.longValue(); + } else if (value instanceof Double d && d % 1 == 0 && d >= -0x1p63 && d < 0x1p63) { + return d.longValue(); + } else if (value instanceof Float f) { + return f.doubleValue(); + } else if (value instanceof Double d) { + return d; + } + + return null; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof UnorderedMultiValueKey that)) return false; + if (storages.length != that.storages.length) return false; + if (hashCodeValue != that.hashCodeValue) return false; + for (int i = 0; i < storages.length; i++) { + Object thisFolded = foldObject(this.get(i)); + Object thatFolded = foldObject(that.get(i)); + if (!Objects.equals(thisFolded, thatFolded)) { + return false; + } + } + + return true; + } + + @Override + public int hashCode() { + return this.hashCodeValue; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java index f1398d3cf75c..6ba915589e3d 100644 --- a/std-bits/table/src/main/java/org/enso/table/data/table/Table.java +++ b/std-bits/table/src/main/java/org/enso/table/data/table/Table.java @@ -1,5 +1,6 @@ package org.enso.table.data.table; +import org.enso.base.text.TextFoldingStrategy; import org.enso.table.data.column.builder.object.InferredBuilder; import org.enso.table.data.column.storage.BoolStorage; import
org.enso.table.data.column.storage.Storage; @@ -12,6 +13,7 @@ import org.enso.table.data.table.problems.AggregatedProblems; import org.enso.table.error.NoSuchColumnException; import org.enso.table.error.UnexpectedColumnTypeException; +import org.enso.table.operations.Distinct; import java.util.*; import java.util.stream.Collectors; @@ -231,6 +233,26 @@ public Table orderBy(Column[] columns, Long[] directions, Comparator obj return this.applyMask(mask); } + /** + * Creates a new table keeping only rows with distinct key columns. + * + * @param keyColumns set of columns to use as an Index + * @param textFoldingStrategy a strategy for folding text columns + * @return a table where duplicate rows with the same key are removed + */ + public Table distinct(Column[] keyColumns, TextFoldingStrategy textFoldingStrategy) { + var problems = new AggregatedProblems(); + var rowsToKeep = Distinct.buildDistinctRowsMask(rowCount(), keyColumns, textFoldingStrategy, problems); + int cardinality = rowsToKeep.cardinality(); + Column[] newColumns = new Column[this.columns.length]; + Index newIx = index.mask(rowsToKeep, cardinality); + for (int i = 0; i < this.columns.length; i++) { + newColumns[i] = this.columns[i].mask(newIx, rowsToKeep, cardinality); + } + + return new Table(newColumns, newIx, problems); + } + /** * Selects a subset of columns of this table, by names. 
* diff --git a/std-bits/table/src/main/java/org/enso/table/operations/Distinct.java b/std-bits/table/src/main/java/org/enso/table/operations/Distinct.java new file mode 100644 index 000000000000..7ff4c7114438 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/operations/Distinct.java @@ -0,0 +1,44 @@ +package org.enso.table.operations; + +import java.util.*; + +import org.enso.base.text.TextFoldingStrategy; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.index.MultiValueKeyBase; +import org.enso.table.data.index.UnorderedMultiValueKey; +import org.enso.table.data.table.Column; +import org.enso.table.data.table.problems.AggregatedProblems; +import org.enso.table.data.table.problems.FloatingPointGrouping; + +public class Distinct { + /** Creates a row mask containing only the first row from sets of rows grouped by key columns. */ + public static BitSet buildDistinctRowsMask( + int tableSize, + Column[] keyColumns, + TextFoldingStrategy textFoldingStrategy, + AggregatedProblems problems) { + var mask = new BitSet(); + if (keyColumns.length != 0) { + HashSet visitedRows = new HashSet<>(); + int size = keyColumns[0].getSize(); + Storage[] storage = Arrays.stream(keyColumns).map(Column::getStorage).toArray(Storage[]::new); + for (int i = 0; i < size; i++) { + UnorderedMultiValueKey key = new UnorderedMultiValueKey(storage, i, textFoldingStrategy); + + if (key.hasFloatValues()) { + problems.add(new FloatingPointGrouping("Distinct", i)); + } + + if (!visitedRows.contains(key)) { + mask.set(i); + visitedRows.add(key); + } + } + } else { + // If there are no columns to distinct-by we just return the whole table. 
+ mask.set(0, tableSize); + } + + return mask; + } +} diff --git a/test/Table_Tests/src/Aggregate_Column_Spec.enso b/test/Table_Tests/src/Aggregate_Column_Spec.enso index 62d1215c52c6..0398190f7e16 100644 --- a/test/Table_Tests/src/Aggregate_Column_Spec.enso +++ b/test/Table_Tests/src/Aggregate_Column_Spec.enso @@ -31,7 +31,8 @@ spec = Test.group "Aggregate Columns" <| result = acc = Aggregate_Column_Helper.java_aggregator "Name" resolved indexes = Vector.new table.row_count v->v - acc.aggregate indexes.to_array + Illegal_Argument_Error.handle_java_exception <| + acc.aggregate indexes.to_array if epsilon != False then ((result - expected_result).abs < epsilon).should_be_true else result.should_equal expected_result diff --git a/test/Table_Tests/src/Aggregate_Spec.enso b/test/Table_Tests/src/Aggregate_Spec.enso index 0c2065ee6aea..8c6d105d4584 100644 --- a/test/Table_Tests/src/Aggregate_Spec.enso +++ b/test/Table_Tests/src/Aggregate_Spec.enso @@ -1255,7 +1255,8 @@ aggregate_spec prefix table empty_table table_builder materialize is_database te Test.specify "should warn if grouping on a floating point" <| action = table.aggregate [Group_By 1] on_problems=_ - problems = [Floating_Point_Grouping_Data "GroupBy" [2]] + # All rows are marked as floating point, because the integers get coerced to double when stored in DoubleStorage + problems = [Floating_Point_Grouping_Data "GroupBy" [0, 1, 2]] tester = expect_column_names ["Value"] Problems.test_problem_handling action problems tester @@ -1333,7 +1334,7 @@ aggregate_spec prefix table empty_table table_builder materialize is_database te problems = Warning.get_all new_table . map .value problems.length . should_equal 1 problems.at 0 . is_an Floating_Point_Grouping_Data . should_be_true - problems.at 0 . rows . length . should_equal 9 + problems.at 0 . rows . length . 
should_equal 15 if is_database then Test.group prefix+"Table.aggregate should report unsupported operations but not block other aggregations in warning mode" pending=pending <| diff --git a/test/Table_Tests/src/Table_Spec.enso b/test/Table_Tests/src/Table_Spec.enso index 210516f9da81..890487f684b9 100644 --- a/test/Table_Tests/src/Table_Spec.enso +++ b/test/Table_Tests/src/Table_Spec.enso @@ -1,4 +1,5 @@ from Standard.Base import all +from Standard.Base.Error.Problem_Behavior import Report_Error import Standard.Table from Standard.Table import Column, Sort_Column, Sort_Column_Selector @@ -7,6 +8,8 @@ from Standard.Table.Errors as Table_Errors import Invalid_Output_Column_Names_Da import Standard.Table.Data.Storage import Standard.Table.Data.Aggregate_Column from Standard.Table.Data.Aggregate_Column import all hiding First, Last +from Standard.Table.Data.Column_Selector import By_Name +from Standard.Table.Errors import Floating_Point_Grouping_Data import Standard.Visualization @@ -14,6 +17,7 @@ import Standard.Test import Standard.Test.Problems import project.Common_Table_Spec +from project.Util import all type My My_Data x y @@ -633,8 +637,8 @@ spec = t1.info.at "Column" . to_vector . should_equal ["dates", "Shortest texts", "First texts", "First objects", "First ints", "Last mixed"] t1.info.at "Storage Type" . to_vector . should_equal [Storage.Date, Storage.Text, Storage.Text, Storage.Any, Storage.Integer, Storage.Any] - t2 = table.aggregate [Mode "dates", Count_Distinct "objects", Count_Distinct "texts", Minimum "ints", Maximum "floats"] - t2.info.at "Column" . to_vector . should_equal ["Mode dates", "Count Distinct objects", "Count Distinct texts", "Minimum ints", "Maximum floats"] + t2 = table.aggregate [Mode "dates", Count_Not_Nothing "objects", Count_Distinct "texts", Minimum "ints", Maximum "floats"] + t2.info.at "Column" . to_vector . 
should_equal ["Mode dates", "Count Not Nothing objects", "Count Distinct texts", "Minimum ints", "Maximum floats"] t2.info.at "Storage Type" . to_vector . should_equal [Storage.Date, Storage.Integer, Storage.Integer, Storage.Integer, Storage.Decimal] t3 = table.aggregate [Group_By "texts", Group_By "ints", Aggregate_Column.Last "floats"] @@ -645,5 +649,91 @@ spec = t4.info.at "Column" . to_vector . should_equal ["mixed", "Sum ints", "Sum floats"] t4.info.at "Storage Type" . to_vector . should_equal [Storage.Any, Storage.Decimal, Storage.Decimal] + ## We have a separate set of tests for In-Memory distinct as it gives us + more guarantees: preserving order of rows and always selecting the first + row of ones sharing the same distinctness key. For database tests (to be + added later) we can not rely on ordering. + Test.group "[In-Memory] Table.distinct" <| + Test.specify "should allow to select distinct rows based on a subset of columns" <| + a = ["A", ["a", "a", "a", "a", "a", "a"]] + b = ["B", [1, 1, 2, 2, 1, 2]] + c = ["C", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]] + t = Table.new [a, b, c] + + r1 = t.distinct (By_Name ["A"]) on_problems=Report_Error + r1.at "A" . to_vector . should_equal ["a"] + r1.at "B" . to_vector . should_equal [1] + r1.at "C" . to_vector . should_equal [0.1] + + r2 = t.distinct (By_Name ["A", "B"]) on_problems=Report_Error + r2.at "A" . to_vector . should_equal ["a", "a"] + r2.at "B" . to_vector . should_equal [1, 2] + r2.at "C" . to_vector . should_equal [0.1, 0.3] + + Test.specify "should handle nulls correctly" <| + a = ["A", ["a", Nothing, "b", "a", "b", Nothing, "a", "b"]] + b = ["B", [1, 2, 3, 4, 5, 6, 7, 8]] + t = Table.new [a, b] + r = t.distinct (By_Name ["A"]) on_problems=Report_Error + r.at "A" . to_vector . should_equal ["a", Nothing, "b"] + r.at "B" . to_vector . should_equal [1, 2, 3] + + Test.specify "should handle Unicode normalization of keys correctly" <| + t1 = Table.new [["X", ['ś', 's\u0301', 's', 'ś']]] + t1.distinct . at "X" . 
to_vector . should_equal ['ś', 's'] + + Test.specify "should allow to control case-sensitivity of keys" <| + x = ["X", ['A', 'a', 'enso', 'śledź', 'Enso', 'A', 's\u0301ledz\u0301']] + y = ["Y", [1, 2, 3, 4, 5, 6, 7]] + t1 = Table.new [x, y] + d1 = t1.distinct (By_Name ["X"]) on_problems=Report_Error + d1.at "X" . to_vector . should_equal ['A', 'a', 'enso', 'śledź', 'Enso'] + d1.at "Y" . to_vector . should_equal [1, 2, 3, 4, 5] + + d2 = t1.distinct (By_Name ["X"]) case_sensitive=Case_Insensitive_Data on_problems=Report_Error + d2.at "X" . to_vector . should_equal ['A', 'enso', 'śledź'] + d2.at "Y" . to_vector . should_equal [1, 3, 4] + + t2 = Table.new [["X", ["łąka", "STRASSE", "Straße", "ffi", "ŁĄka", "ffi"]]] + t2.distinct case_sensitive=Case_Insensitive_Data . at "X" . to_vector . should_equal ["łąka", "STRASSE", "ffi"] + + Test.specify "should report a warning if the key contains floating point values" <| + t1 = Table.new [["X", [3.0, 1.0, 2.0, 2.0, 1.0]]] + action1 = t1.distinct on_problems=_ + tester1 table = + table.at "X" . to_vector . should_equal [3.0, 1.0, 2.0] + problems1 = [Floating_Point_Grouping_Data "Distinct" [0, 1, 2, 3, 4]] + Problems.test_problem_handling action1 problems1 tester1 + + t2 = Table.new [["X", [1.00000000000001, -0.3, 1.00000000000002, 1.5, 1.00000000000002, 1.00000000000002]]] + action2 = t2.distinct on_problems=_ + tester2 table = + table.at "X" . to_vector . 
should_equal [1.00000000000001, -0.3, 1.00000000000002, 1.5] + problems2 = [Floating_Point_Grouping_Data "Distinct" [0, 1, 2, 3, 4, 5]] + Problems.test_problem_handling action2 problems2 tester2 + + Test.specify "should report a warning and report the whole table if no columns were selected" <| + t = Table.new [["A", [1, 2, 1, 1]]] + test table = table.should_equal t + + action1 = t.distinct (By_Name []) on_problems=_ + problems1 = [No_Input_Columns_Selected] + Problems.test_problem_handling action1 problems1 test + + action2 = t.distinct (By_Name ["mismatched"]) on_problems=_ + problems2 = [Missing_Input_Columns_Data ["mismatched"], No_Input_Columns_Selected] + Problems.test_problem_handling action2 problems2 test + + Test.specify "until hashing is supported, should throw an error when trying to aggregate a custom object" <| + t = Table.new [["X", [My_Data 1 2, My_Data 3 4, My_Data 1 2]]] + t.distinct . should_fail_with Illegal_Argument_Error_Data + + Test.specify "should group by all columns by default" <| + a = ["A", ["a", "b", "a", "b", "a", "b"]] + b = ["B", [2, 1, 2, 2, 2, 1]] + t = Table.new [a, b] + r = t.distinct on_problems=Report_Error + r.at "A" . to_vector . should_equal ["a", "b", "b"] + r.at "B" . to_vector . should_equal [2, 1, 2] main = Test.Suite.run_main spec diff --git a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/copyright-ignore b/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/copyright-ignore deleted file mode 100644 index d8c40518228e..000000000000 --- a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/copyright-ignore +++ /dev/null @@ -1,203 +0,0 @@ -2000.01.01 copyright update [Y2K has arrived] */ -2000.01.01 1.06 copyright update */ -

Copyright © IBM Corporation 1999. All rights reserved. -Copyright (C) 1996-2005, International Business Machines Corporation and * -Copyright (C) 1996-2007, International Business Machines Corporation and * -Copyright (C) 1996-2007, International Business Machines Corporation and * -Copyright (C) 1996-2009, Google, International Business Machines Corporation and * -Copyright (C) 1996-2009, International Business Machines Corporation and * -Copyright (C) 1996-2010, International Business Machines Corporation and -Copyright (C) 1996-2010, International Business Machines Corporation and * -Copyright (C) 1996-2010, International Business Machines Corporation and * -Copyright (C) 1996-2011, International Business Machines Corporation and -Copyright (C) 1996-2011, International Business Machines Corporation and * -Copyright (C) 1996-2011, International Business Machines Corporation and * -Copyright (C) 1996-2012, International Business Machines Corporation and * -Copyright (C) 1996-2013, International Business Machines Corporation and -Copyright (C) 1996-2013, International Business Machines Corporation and * -Copyright (C) 1996-2014, International Business Machines Corporation and -Copyright (C) 1996-2014, International Business Machines Corporation and * -Copyright (C) 1996-2015, Google, Inc., International Business Machines Corporation and -Copyright (C) 1996-2015, International Business Machines -Copyright (C) 1996-2015, International Business Machines Corporation and -Copyright (C) 1996-2015, International Business Machines Corporation and * -Copyright (C) 1996-2016, Google, International Business Machines Corporation and -Copyright (C) 1996-2016, International Business Machines -Copyright (C) 1996-2016, International Business Machines Corporation and -Copyright (C) 1996-2016, International Business Machines Corporation and * -Copyright (C) 1996-2016, International Business Machines Corporation and * -Copyright (C) 1999-2014, International Business Machines 
-Copyright (C) 1999-2015, International Business Machines -Copyright (C) 2000-2009, International Business Machines Corporation and * -Copyright (C) 2000-2014, International Business Machines -Copyright (C) 2000-2014, International Business Machines Corporation and -Copyright (C) 2000-2016, International Business Machines Corporation and -Copyright (C) 2001-2004, International Business Machines Corporation and * -Copyright (C) 2001-2008, International Business Machines -Copyright (C) 2001-2009, International Business Machines Corporation and * -Copyright (C) 2001-2010, International Business Machines -Copyright (C) 2001-2010, International Business Machines Corporation and * -Copyright (C) 2001-2011, International Business Machines Corporation and * -Copyright (C) 2001-2012, International Business Machines -Copyright (C) 2001-2013, International Business Machines Corporation and * -Copyright (C) 2001-2015, International Business Machines Corporation and -Copyright (C) 2001-2016 International Business Machines Corporation and -Copyright (C) 2001-2016, International Business Machines -Copyright (C) 2001-2016, International Business Machines Corporation and -Copyright (C) 2001-2016, International Business Machines Corporation and * -Copyright (C) 2002-2009 International Business Machines Corporation * -Copyright (C) 2002-2010, International Business Machines -Copyright (C) 2002-2010, International Business Machines Corporation and * -Copyright (C) 2002-2014, International Business Machines Corporation and others. 
-Copyright (C) 2002-2016, International Business Machines Corporation and -Copyright (C) 2003-2010, International Business Machines -Copyright (C) 2003-2011, International Business Machines Corporation and * -Copyright (C) 2003-2011, International Business Machines Corporation and * -Copyright (C) 2003-2014, International Business Machines Corporation and -Copyright (C) 2003-2014, International Business Machines Corporation and * -Copyright (C) 2003-2015, International Business Machines Corporation and -Copyright (C) 2003-2016, Google, International Business Machines Corporation -Copyright (C) 2003-2016, International Business Machines Corporation and -Copyright (C) 2003-2016, International Business Machines Corporation and * -Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved. -Copyright (C) 2004-2006, International Business Machines Corporation and * -Copyright (C) 2004-2009, International Business Machines Corporation and * -Copyright (C) 2004-2009, International Business Machines Corporation and * -Copyright (C) 2004-2010, International Business Machines -Copyright (C) 2004-2010, International Business Machines Corporation and * -Copyright (C) 2004-2014, International Business Machines Corporation and -Copyright (C) 2004-2015, International Business Machines -Copyright (C) 2004-2016, Google Inc, International Business Machines -Copyright (C) 2004-2016, International Business Machines Corporation and -Copyright (C) 2004-2016, International Business Machines Corporation and * -Copyright (C) 2005 - 2012, International Business Machines Corporation and * -Copyright (C) 2005 - 2014, International Business Machines Corporation and * -Copyright (C) 2005-2006, International Business Machines -Copyright (C) 2005-2010, International Business Machines -Copyright (C) 2005-2011, International Business Machines Corporation and * -Copyright (C) 2005-2012, International Business Machines Corporation and * -Copyright (C) 
2005-2012, International Business Machines Corporation and * -Copyright (C) 2005-2013, International Business Machines Corporation and * -Copyright (C) 2005-2015, International Business Machines Corporation and -Copyright (C) 2005-2016 International Business Machines Corporation and -Copyright (C) 2005-2016, International Business Machines Corporation and -Copyright (C) 2005-2016, International Business Machines Corporation and * -Copyright (C) 2006-2009, Google, International Business Machines Corporation * -Copyright (C) 2006-2015, International Business Machines Corporation and -Copyright (C) 2006-2016, Google, International Business Machines Corporation -Copyright (C) 2007, International Business Machines Corporation and * -Copyright (C) 2007-2008, International Business Machines Corporation and * -Copyright (C) 2007-2009, International Business Machines Corporation and * -Copyright (C) 2007-2010, International Business Machines Corporation and * -Copyright (C) 2007-2010, International Business Machines Corporation and * -Copyright (C) 2007-2011, International Business Machines Corporation and * -Copyright (C) 2007-2011, International Business Machines Corporation and others. 
-Copyright (C) 2007-2012, International Business Machines Corporation and * -Copyright (C) 2007-2013, International Business Machines Corporation and * -Copyright (C) 2007-2014, International Business Machines Corporation and * -Copyright (C) 2007-2014, International Business Machines Corporation and * -Copyright (C) 2007-2015, Google Inc, International Business Machines Corporation -Copyright (C) 2007-2015, International Business Machines Corporation and -Copyright (C) 2007-2015, International Business Machines Corporation and * -Copyright (C) 2007-2016, International Business Machines -Copyright (C) 2007-2016, International Business Machines Corporation and -Copyright (C) 2007-2016, International Business Machines Corporation and * -Copyright (C) 2008-2009, Google, International Business Machines -Copyright (C) 2008-2009, International Business Machines -Copyright (C) 2008-2014, Google, International Business Machines -Copyright (C) 2008-2014, International Business Machines Corporation and * -Copyright (C) 2008-2015, International Business Machines Corporation and -Copyright (C) 2008-2016 International Business Machines Corporation -Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation -Copyright (C) 2008-2016, International Business Machines -Copyright (C) 2008-2016, International Business Machines Corporation and -Copyright (C) 2009 , Yahoo! Inc. 
* -Copyright (C) 2009, Google, International Business Machines Corporation and * -Copyright (C) 2009, International Business Machines Corporation and * -Copyright (C) 2009, International Business Machines Corporation and * -Copyright (C) 2009,2016 International Business Machines Corporation and -Copyright (C) 2009-2010, Google, International Business Machines Corporation * -Copyright (C) 2009-2010, International Business Machines Corporation and * -Copyright (C) 2009-2011, Google, International Business Machines Corporation -Copyright (C) 2009-2011, International Business Machines Corporation and * -Copyright (C) 2009-2012, International Business Machines Corporation and * -Copyright (C) 2009-2013, International Business Machines Corporation and * -Copyright (C) 2009-2014, International Business Machines -Copyright (C) 2009-2014, International Business Machines Corporation and -Copyright (C) 2009-2014, International Business Machines Corporation and * -Copyright (C) 2009-2015, Google, International Business Machines Corporation -Copyright (C) 2009-2015, International Business Machines -Copyright (C) 2009-2015, International Business Machines Corporation and -Copyright (C) 2009-2015, International Business Machines Corporation and * -Copyright (C) 2009-2016, Google, Inc.; International Business Machines Corporation -Copyright (C) 2009-2016, International Business Machines -Copyright (C) 2009-2016, International Business Machines Corporation and -Copyright (C) 2009-2016, International Business Machines Corporation and * -Copyright (C) 2009-2016, International Business Machines Corporation, -Copyright (C) 2010, International Business Machines -Copyright (C) 2010, International Business Machines Corporation and * -Copyright (C) 2010-2011, Google, International Business Machines * -Copyright (C) 2010-2013, International Business Machines Corporation and * -Copyright (C) 2010-2014, Google, International Business Machines Corporation * -Copyright (C) 2010-2014, 
International Business Machines -Copyright (C) 2010-2015, International Business Machines -Copyright (C) 2010-2016, Google, Inc.; International Business Machines * -Copyright (C) 2010-2016, International Business Machines -Copyright (C) 2011, International Business Machines -Copyright (C) 2011, International Business Machines Corporation and * -Copyright (C) 2011-2014, International Business Machines -Copyright (C) 2011-2016, International Business Machines Corporation -Copyright (C) 2011-2016, International Business Machines Corporation and -Copyright (C) 2011-2016, International Business Machines Corporation and * -Copyright (C) 2012, International Business Machines Corporation and * -Copyright (C) 2012-2014, International Business Machines -Copyright (C) 2012-2014, International Business Machines Corporation and * -Copyright (C) 2012-2015, International Business Machines -Copyright (C) 2012-2015, International Business Machines Corporation and * -Copyright (C) 2012-2016, Google, International Business Machines Corporation and -Copyright (C) 2012-2016, International Business Machines -Copyright (C) 2012-2016, International Business Machines Corporation and -Copyright (C) 2012-2016, International Business Machines Corporation and * -Copyright (C) 2013, Google Inc, International Business Machines Corporation and * -Copyright (C) 2013, International Business Machines Corporation and * -Copyright (C) 2013-2014, International Business Machines -Copyright (C) 2013-2014, International Business Machines Corporation and * -Copyright (C) 2013-2015, International Business Machines -Copyright (C) 2013-2015, International Business Machines Corporation and -Copyright (C) 2013-2016, International Business Machines Corporation and -Copyright (C) 2014, International Business Machines Corporation and -Copyright (C) 2014, International Business Machines Corporation and * -Copyright (C) 2014-2015, International Business Machines Corporation and -Copyright (C) 2014-2016, 
International Business Machines Corporation and -Copyright (C) 2015, International Business Machines Corporation and -Copyright (C) 2015-2016, International Business Machines Corporation and -Copyright (C) 2016, International Business Machines Corporation and -Copyright (C) 2016, International Business Machines Corporation and * -Copyright (c) 2001-2011, International Business Machines -Copyright (c) 2001-2016, International Business Machines -Copyright (c) 2001-2016, International Business Machines Corporation and -Copyright (c) 2002, International Business Machines Corporation -Copyright (c) 2002-2007, International Business Machines Corporation -Copyright (c) 2002-2010, International Business Machines -Copyright (c) 2002-2010, International Business Machines Corporation -Copyright (c) 2002-2011, International Business Machines Corporation -Copyright (c) 2002-2014, Google, International Business Machines -Copyright (c) 2002-2014, International Business Machines Corporation -Copyright (c) 2002-2015, International Business Machines -Copyright (c) 2002-2016, International Business Machines -Copyright (c) 2003-2010, International Business Machines -Copyright (c) 2003-2011, International Business Machines -Copyright (c) 2003-2016 International Business Machines -Copyright (c) 2003-2016, International Business Machines -Copyright (c) 2004-2010, International Business Machines -Copyright (c) 2004-2013, International Business Machines -Copyright (c) 2004-2014, International Business Machines -Copyright (c) 2004-2015, International Business Machines -Copyright (c) 2004-2016, International Business Machines -Copyright (c) 2007-2009 International Business Machines Corporation and * -Copyright (c) 2007-2015 International Business Machines Corporation and * -Copyright (c) 2013-2014, International Business Machines -Copyright IBM Corporation, 1996-2016. All Rights Reserved. */ -Copyright IBM Corporation, 1997, 2000, 2005, 2007. All Rights Reserved. 
*/ -Does not write a copyright string. -copyright=" Copyright (c) IBM Corporation 1996, 2000. All rights reserved. "; diff --git a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/copyright-keep b/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/copyright-keep deleted file mode 100644 index f62dec1437f5..000000000000 --- a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/copyright-keep +++ /dev/null @@ -1,2 +0,0 @@ -Copyright (C) 1996-2004, International Business Machines Corporation and * -Copyright (C) 2001-2014, International Business Machines diff --git a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/custom-license b/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/custom-license deleted file mode 100644 index 6b1d0bfabc3c..000000000000 --- a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/custom-license +++ /dev/null @@ -1 +0,0 @@ -LICENSE diff --git a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/files-keep b/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/files-keep deleted file mode 100644 index 6b1d0bfabc3c..000000000000 --- a/tools/legal-review/Table/com.ibm.icu.icu4j-71.1/files-keep +++ /dev/null @@ -1 +0,0 @@ -LICENSE diff --git a/tools/legal-review/Table/report-state b/tools/legal-review/Table/report-state index 4d6ed8bccf9d..f7615b777780 100644 --- a/tools/legal-review/Table/report-state +++ b/tools/legal-review/Table/report-state @@ -1,3 +1,3 @@ -3319119D931FA7A17240698DB17FF158F1BE4B624E36CA299EC49AC40564270D -3DAA51F9268FF48C287908A4AB284DB70A850F3F5D535A01700DFF05061F7EFD +3D20F317407799FC2002CA1A005A2F5CDBFE3A082AD7BA59D08F04270EF9B88C +0DF140BB506529B02B8A79B1E32040D7B4515E690EB2C8F32B7F74DD0E821719 0