Skip to content

Commit

Permalink
Starting on Rename
Browse files Browse the repository at this point in the history
Align Column_Mapping

Add By_Position
Separating out the validation for By_Index so it can be reused for rename

By_Position implemented

By_Index implemented
Adjusted behaviour following discussion with Ned, so that renames dominate untouched columns.

Moving to validation style checks for problems

Putting accumulator back

Rename work
  • Loading branch information
jdunkerley committed Feb 7, 2022
1 parent c174fcd commit 481f0f5
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 47 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from Standard.Base import all

from Standard.Table.Data.Matching import Matching_Strategy, Exact

## Specifies a selection of columns from the table and the new name for them to
   become.
type Column_Mapping

    ## Selects columns based on their names.

       `names` maps each name criterion to the new name for the columns it
       matches.

       The `matching_strategy` can be used to specify if the names should be
       matched exactly or should be treated as regular expressions. It also
       allows specifying if the matching should be case-sensitive.
    type By_Name (names : Map Text Text) (matching_strategy : Matching_Strategy = Exact True)

    ## Selects columns by their index.

       `indexes` maps each column index to the new name for that column.

       The index of the first column in the table is 0. If the provided index is
       negative, it counts from the end of the table (e.g. -1 refers to the last
       column in the table).
    type By_Index (indexes : Map Number Text)

    ## Selects columns having exactly the same names as the columns provided in
       the input.

       `columns` maps each input column to the new name for the table column
       sharing its name.

       The input columns do not necessarily have to come from the same table, so
       this approach can be used to match columns with the same names as a set
       of columns of some other table, for example, when preparing for a join.
    type By_Column (columns : Map Column Text)

    ## Selects columns by position starting at the first column until the
       new_names is exhausted.

       The first entry of `new_names` is the new name of the first column, the
       second entry the new name of the second column, and so on.
    type By_Position (new_names : Vector Text)

## UNSTABLE
   A temporary workaround to allow the By_Name constructor to work with default arguments.

   Builds a `By_Name` mapping from `names`; when `matching_strategy` is not
   supplied it defaults to `Exact.new`.
By_Name.new : Map Text Text -> Matching_Strategy -> By_Name
By_Name.new names (matching_strategy = Exact.new) = By_Name names matching_strategy
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ from Standard.Table.Data.Column_Selector as Column_Selector_Module import Column
from Standard.Table.Data.Sort_Method as Sort_Method_Module import Sort_Method
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
import Standard.Table.Data.Position
from Standard.Table.Error as Error_Module import Missing_Input_Columns, Column_Indexes_Out_Of_Range, No_Output_Columns, Duplicate_Column_Selectors, Input_Indices_Already_Matched
from Standard.Table.Error as Error_Module import Missing_Input_Columns, Column_Indexes_Out_Of_Range, No_Output_Columns, Duplicate_Column_Selectors, Input_Indices_Already_Matched, Too_Many_Column_Names_Provided, Duplicate_Output_Column_Names
import Standard.Table.Data.Column_Mapping
import Standard.Table.Internal.Unique_Name_Strategy
import Standard.Base.Data.Ordering.Natural_Order

## PRIVATE
Expand Down Expand Up @@ -112,6 +114,64 @@ reorder_columns internal_columns selector position on_problems warnings =
Position.After_Other_Columns -> other_columns + selection
result

## PRIVATE
   A helper function computing the new column names for a rename operation.

   Arguments:
   - internal_columns: the columns of the table, in table order.
   - mapping: a `Column_Mapping` describing which columns get which new name.
   - on_problems: the `Problem_Behavior` used to attach the collected problems.
   - warnings: the warning system the problems are reported through.

   The names are produced in two passes: first every column selected by the
   mapping is given its new name, then every column left untouched keeps its
   current name. Both passes go through the same `Unique_Name_Strategy`, so
   explicitly requested names are registered first and take precedence over
   untouched columns when names clash. Names that had to be altered to stay
   unique are reported as `Duplicate_Output_Column_Names`.

   NOTE(review): the signature declares a `Map` result, but the value passed to
   `attach_problems_before` is the sequence built by `map_with_index` - confirm
   the intended return type.
rename_columns : Vector -> Column_Mapping -> Problem_Behavior -> Warnings.Warning_System -> Map
rename_columns internal_columns mapping on_problems warnings =
    # ToDo: Use .name on internal_columns
    # ToDo: RegEx Baby
    # ToDo: Invalid Name Errors ==> Column

    unique = Unique_Name_Strategy.new
    renames = Vector.new_builder

    # Registers `target` with the uniqueness strategy and records the requested
    # name whenever it had to be changed to avoid a clash.
    make_unique = target ->
        new_target = unique.make_unique target
        if target != new_target then renames.append target
        new_target

    col_count = internal_columns.length

    # Every branch yields a `Validation_Result` whose `valid` part has one entry
    # per column: the new name, or `Nothing` when the column is not renamed.
    mapped = case mapping of
        Column_Mapping.By_Column map ->
            # Delegates to the `By_Name` branch using the names of the key columns.
            output = here.rename_columns internal_columns (Column_Mapping.By_Name (map.transform k->v->[k.name, v]) (Matching.Exact case_sensitivity=True)) on_problems warnings
            Validation_Result output []
        Column_Mapping.By_Name map ms ->
            keys = map.keys
            # NOTE(review): the result of `keys.find` is used below as an index
            # into `keys` - confirm `Vector.find` returns a position rather than
            # the matching element. The mapper is also handed the column itself
            # rather than its name (see the ToDo above).
            mapper = name->
                index = keys.find k->(Matching.match_single_criterion name k ms)
                case index of
                    Nothing -> Nothing
                    _ -> make_unique (map.get (keys.at index))

            new_names = 0.up_to col_count . map i->(mapper (internal_columns.at i))
            Validation_Result new_names []
        Column_Mapping.By_Index map ->
            validation = here.validate_indices col_count map.keys
            good_indices = validation.valid

            # `good_indices` are already resolved to non-negative positions, so
            # the name may be stored under either `i` or its negative alias.
            index_map = Map.from_vector <| good_indices.map i->[i, map.get_or_else i (map.get (i - col_count))]

            new_names = 0.up_to col_count . map i->
                target = index_map.get_or_else i Nothing
                if target.is_nothing then target else make_unique target

            Validation_Result new_names validation.problems
        Column_Mapping.By_Position vec ->
            # Surplus names are dropped and reported. The parentheses are
            # required so the dropped names are passed as a single argument.
            validation = case vec.length > col_count of
                True -> Validation_Result (vec.take_start col_count) [Too_Many_Column_Names_Provided (vec.drop_start col_count)]
                False -> Validation_Result vec []
            good_names = validation.valid

            # Supplied names go through `make_unique` like in the other
            # branches; columns beyond the supplied names are left as `Nothing`
            # and handled by the second pass below.
            new_names = 0.up_to col_count . map i->
                if i < good_names.length then make_unique (good_names.at i) else Nothing
            Validation_Result new_names validation.problems

    # Second pass: untouched columns keep their current name, made unique
    # against the names already handed out.
    processed = mapped.valid.map_with_index i->n->
        if n.is_nothing then (make_unique (internal_columns.at i)) else n

    problems = mapped.problems + (if renames.length == 0 then [] else [Duplicate_Output_Column_Names renames.to_vector])
    on_problems.attach_problems_before problems warnings processed



## PRIVATE
A helper function encapsulating shared code for `sort_columns`
implementations of various Table variants. See the documentation for the
Expand Down Expand Up @@ -163,37 +223,15 @@ sort_columns internal_columns sort_method =
select_columns_helper : Vector -> Column_Selector -> Boolean -> Problem_Behavior -> Warnings.Warning_System -> Vector
select_columns_helper internal_columns selector reorder on_problems warnings = case selector of
By_Name names matching_strategy ->
split_result = here.split_to_distinct_and_duplicates names
distinct_names = split_result.first
duplicate_names = split_result.second
problems = if duplicate_names.is_empty then [] else
[Duplicate_Column_Selectors duplicate_names]
on_problems.attach_problems_before problems warnings <|
validation = here.validate_unique names v->[Duplicate_Column_Selectors v]
on_problems.attach_problems_before validation.problems warnings <|
Warnings.map_warnings_and_errors here.promote_no_matches_to_missing_columns warnings warnings->
Matching.match_criteria internal_columns distinct_names reorder=reorder name_mapper=(_.name) matching_strategy=matching_strategy on_problems=on_problems warnings=warnings
Matching.match_criteria internal_columns validation.valid reorder=reorder name_mapper=(_.name) matching_strategy=matching_strategy on_problems=on_problems warnings=warnings
By_Index indices ->
partitioned_indices = indices.partition (here.is_index_valid internal_columns)
inbound_indices = partitioned_indices.first
oob_indices = partitioned_indices.second

split_result = here.split_to_distinct_and_duplicates inbound_indices
duplicate_indices = split_result.second
distinct_indices = split_result.first

resolved_indices = distinct_indices.map ix-> Pair ix (here.resolve_index internal_columns ix)
alias_split_result = here.split_to_distinct_and_duplicates resolved_indices .second
aliasing_indices = alias_split_result.second.map .first
good_indices = alias_split_result.first.map .second

oob_problems = if oob_indices.is_empty then [] else
[Column_Indexes_Out_Of_Range oob_indices]
duplicate_problems = if duplicate_indices.is_empty then [] else
[Duplicate_Column_Selectors duplicate_indices]
aliasing_problems = if aliasing_indices.is_empty then [] else
[Input_Indices_Already_Matched aliasing_indices]
problems = oob_problems + duplicate_problems + aliasing_problems

on_problems.attach_problems_before problems warnings <| case reorder of
validation = here.validate_indices internal_columns.length indices
good_indices = validation.valid

on_problems.attach_problems_before validation.problems warnings <| case reorder of
True ->
here.select_indices_reordering internal_columns good_indices
False ->
Expand All @@ -208,8 +246,7 @@ select_columns_helper internal_columns selector reorder on_problems warnings = c
`Missing_Input_Columns`. Any other errors are returned as-is.
promote_no_matches_to_missing_columns error = case error of
Matching.No_Matches_Found criteria -> Missing_Input_Columns criteria
_ ->
error
_ -> error

## PRIVATE
Selects element from the vector based on the given indices.
Expand All @@ -236,27 +273,54 @@ select_indices_preserving_order vector indices =
If the negative index is sufficiently large, a negative result can still be
returned. This function does not ensure that the resulting indices are within
bounds.
resolve_index vector ix =
if ix < 0 then vector.length+ix else ix
resolve_index : Integer -> Integer -> Integer
resolve_index length ix =
    # Negative indices count back from `length` (-1 is the last position). The
    # result is not range-checked, so it can still be negative or >= `length`.
    if ix < 0 then length+ix else ix

## PRIVATE
Checks if the given index is in the valid range for the provided vector.
is_index_valid vector ix =
actual_ix = here.resolve_index vector ix
actual_ix>=0 && actual_ix<vector.length
is_index_valid : Integer -> Integer -> Boolean
is_index_valid length ix =
    # Resolve a possibly negative index first, then check it addresses one of
    # the `length` positions 0 .. length-1.
    actual_ix = here.resolve_index length ix
    actual_ix>=0 && actual_ix<length

## PRIVATE
Validates a Vector of indices returning a pair of `good_indices` and `problems`
validate_indices : Integer -> Vector -> Validation_Result Vector Vector
validate_indices length indices =
partitioned_indices = indices.partition (here.is_index_valid length)
inbound_indices = partitioned_indices.first
oob_indices = partitioned_indices.second
oob_problems = if oob_indices.is_empty then [] else
[Column_Indexes_Out_Of_Range oob_indices]

uniques = here.validate_unique inbound_indices v->[Duplicate_Column_Selectors v]

resolver = ix->(here.resolve_index length ix)
alias_uniques = here.validate_unique uniques.valid v->[Input_Indices_Already_Matched v] resolver
good_indices = alias_uniques.valid.map resolver

problems = oob_problems + uniques.problems + alias_uniques.problems

type Repeated_Acc distinct_builder duplicate_builder existing
Validation_Result good_indices problems

## PRIVATE
Splits a vector into elements which are distinct and the duplicates.
split_to_distinct_and_duplicates : (Any -> Any) -> Vector -> Pair Vector Vector
split_to_distinct_and_duplicates vector (on = x->x) =
acc = vector.fold (Repeated_Acc Vector.new_builder Vector.new_builder Map.empty) acc-> item->
Duplicates are wrapped as an error
validate_unique : Vector -> (Vector -> Vector) -> (Any -> Any) -> Validation_Result Vector Vector
validate_unique vector problem_wrapper on=(x->x) =
acc = vector.fold [Map.empty, Vector.new_builder, Vector.new_builder] acc-> item->
existing = acc.at 0
key = on item
already_present = acc.existing.get_or_else key False
already_present = existing.get_or_else key False
case already_present of
True ->
Repeated_Acc acc.distinct_builder (acc.duplicate_builder.append item) acc.existing
False ->
Repeated_Acc (acc.distinct_builder.append item) acc.duplicate_builder (acc.existing.insert key True)
Pair acc.distinct_builder.to_vector acc.duplicate_builder.to_vector
True -> [existing, acc.at 1, acc.at 2 . append item]
False -> [existing.insert key True, acc.at 1 . append item, acc.at 2]

duplicates = acc.at 2
problems = if duplicates.length == 0 then [] else problem_wrapper duplicates.to_vector

Validation_Result (acc.at 1).to_vector problems

## PRIVATE
   The outcome of a validation step: `valid` holds the part of the input that
   passed validation and `problems` holds the problems found along the way.
type Validation_Result valid problems
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from Standard.Base import all

## Creates a new `Unique_Name_Strategy` by delegating to
   `Unique_Name_Strategy.new`.
new : Unique_Name_Strategy
new = Unique_Name_Strategy.new

## Hands out names that are unique among all the names it has returned so far.
type Unique_Name_Strategy
    ## Wraps `store`, a map whose keys are the names handed out so far.
    type Unique_Name_Strategy store

    ## Creates a strategy that has not handed out any names yet.
    new : Unique_Name_Strategy
    new = Unique_Name_Strategy Map.empty

    ## Returns `name` if it has not been handed out before, otherwise `name`
       followed by `_1`, `_2`, ... using the first suffix that is still free.
       The returned name is recorded as used.
    make_unique : Text -> Text
    make_unique name = this.internal_unique name 0

    ## PRIVATE
       Tries `name` with the suffix for `shift` (no suffix when `shift` is 0)
       and moves on to `shift+1` while the candidate is already in `store`.
    internal_unique : Text -> Integer -> Text
    internal_unique name shift =
        candidate = if shift == 0 then name else (name + "_"+ shift.to_text)
        already_used = this.store.get_or_else candidate False
        if already_used then @Tail_Call this.internal_unique name (shift+1) else
            # Field 0 of the atom is `store`; it is replaced in place so that
            # later calls on the same strategy see the recorded name.
            Unsafe.set_atom_field this 0 (this.store.insert candidate True)
            candidate

0 comments on commit 481f0f5

Please sign in to comment.