-
Notifications
You must be signed in to change notification settings - Fork 3.9k
/
Copy pathfunc_dep.go
2028 lines (1865 loc) · 68.4 KB
/
func_dep.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package props
import (
"fmt"
"strings"
"github.com/cockroachdb/cockroach/pkg/sql/opt"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
)
// FuncDepSet is a set of functional dependencies (FDs) that encode useful
// relationships between columns in a base or derived relation. Given two sets
// of columns A and B, a functional dependency A-->B holds if A fully determines
// B. In other words, if two different rows have equal values for columns in A,
// then those two rows will also have equal values for columns in B. For
// example, where columns (a1, a2) are in set A, and column (b1) is in set B:
//
// a1 a2 b1
// --------
// 1 2 5
// 1 2 5
// 3 4 6
// 3 4 6
//
// The left side of a functional dependency is called the "determinant", and
// the right side is called the "dependant". Each side can contain zero or more
// columns, though the FuncDepSet library will fold away certain combinations
// that don't provide useful information, like A-->A and A-->(), since every
// column trivially determines itself, as well as the empty set.
//
// When a dependant contains multiple columns, it is equivalent to splitting
// the single FD into multiple FDs, each with a single column dependant:
//
// (a)-->(b,c)
//
// is equivalent to these two FDs:
//
// (a)-->(b)
// (a)-->(c)
//
// When a determinant contains zero columns, as in ()-->A, then A is fully
// determined without reference to any other columns. An equivalent statement is
// that any arbitrary combination of determinant columns trivially determines A.
// And both of these statements are just another way of saying that columns in A
// are constant:
//
// a1 a2 b1 c1
// ----------------
// 1 NULL 3 3
// 1 NULL 3 NULL
// 1 NULL 4 NULL
//
// When a determinant contains multiple columns, then the functional dependency
// holds for the *composite* value of those columns. For example:
//
// a1 a2 b1
// --------
// 1 2 5
// 1 2 5
// 1 3 4
//
// These are valid values, even though a1 has the same values for all three
// rows, because it is only the combination of (a1,a2) that determines (b1).
//
// Multiple FDs can be transitively applied in order to compute the "closure" of
// a set of input columns. The closure includes the input columns plus all
// columns that are functionally dependent on those columns, either directly or
// indirectly. Consider this set of FD's:
//
// (a)-->(b,c,d)
// (b,c,e)-->(f)
// (d)-->(e)
//
// The transitive closure of (a) is (a,b,c,d,e,f). To start, (a) determines
// (b,c,d). From there, (d) transitively determines (e). And now that (b,c,e)
// have been determined, they in turn determine (f). Because (a) determines all
// other columns, if two rows have the same value for (a), then the rows will be
// duplicates, since all other columns will be equal. And if there are no
// duplicate rows, then (a) is a key for the relation.
//
// Deriving FD Sets
//
// Base table primary keys can be trivially mapped into an FD set, since the
// primary key always uniquely determines the other columns:
//
// CREATE TABLE t (a INT PRIMARY KEY, b INT, c INT)
// (a)-->(b,c)
//
// Each SQL relational operator derives its own FD set from the FD sets of its
// inputs. For example, the Select operator augments the FD set of its input,
// based on its filter condition:
//
// SELECT * FROM t WHERE a=1
//
// Equating a column to a constant value constructs a new FD with an empty
// determinant, so that the augmented FD set becomes:
//
// (a)-->(b,c)
// ()-->(a)
//
// Since the value of column "a" is always the same, and since "a" functionally
// determines "b" and "c", the values of all columns are constants. Furthermore,
// because "a" is known to be a key, the result set can have at most one row.
//
// This is but one example of how FDs can assist the optimizer in proving useful
// properties about query results. This information powers many optimizations,
// including eliminating unnecessary DISTINCT operators, simplifying ORDER BY
// columns, removing Max1Row operators, and mapping semi-joins to inner-joins.
//
// NULL Values
//
// FDs become more complex when the possibility of NULL values is introduced.
// SQL semantics often treat a NULL value as an "unknown" value that is not
// equal to any other value, including another NULL value. For example, SQL
// unique indexes exhibit this behavior:
//
// CREATE TABLE t (a INT PRIMARY KEY, b INT, c INT, UNIQUE (b))
//
// Here, "b" column values are unique...except for the case of multiple NULL
// values, which are allowed because each NULL is treated as if it was a
// different value. Contrast this with the different NULL handling rules used
// by SQL's GROUP BY and DISTINCT operators. Those operators treat multiple NULL
// values as duplicates, because each NULL is treated as if it was the same
// value.
//
// The functional dependencies described up until now always use the "NULLs are
// equal" semantics (denoted as NULL= hereafter) in order to answer the question
// "are these two columns equal". The semantics are identical to what this SQL
// expression returns:
//
// ((c1 = c2) OR (c1 IS NULL AND c2 IS NULL)) IS True
//
// And here are some examples:
//
// c1 c2 NULL=
// -----------------
// 1 1 true
// NULL NULL true
// 1 2 false
// 1 NULL false
// NULL 1 false
//
// So now for the definition of A-->B that incorporates NULL values:
//
// for any two rows r1 and r2 in the relation:
// A(r1) NULL= A(r2) ==> B(r1) NULL= B(r2)
//
// Intuitively, if two different rows have equal values for A using "NULLs are
// equal" semantics, then those rows will also have equal values for B using
// those same semantics. As an example, the following sets of rows would be
// valid for the dependency (b)-->(c):
//
// b c
// ----------
// 1 NULL
// 1 NULL
// NULL 1
// NULL 1
// 2 3
// 2 3
//
// b c
// ----------
// NULL NULL
// NULL NULL
//
// but these sets of rows would be invalid:
//
// b c
// ----------
// NULL 1
// NULL NULL
//
// b c
// ----------
// NULL 1
// NULL 2
//
// Unique constraints allow the latter cases, however, and therefore it is
// desirable to somehow encode these weaker dependencies as FDs, because they
// can be strengthened later on if NULL values are filtered from determinant
// columns (more on that below).
//
// The solution is to store an extra "strict" bit on each FD. If true, then the
// the FD is a "strict" dependency, and behaves as described above. However, if
// false, then the FD is a "lax" dependency. Lax dependencies use "squiggly"
// arrow notation to differentiate them from the strict variant:
//
// A~~>B
//
// In contrast to strict dependencies, lax dependencies treat NULLs on
// determinant columns as distinct from one another, with equality semantics
// identical to this SQL expression:
//
// (c1 = c2) IS True
//
// In other words, if either c1 or c2 is NULL, or both are NULL, then c1 is
// considered not equal to c2. The definition for A~~>B follows from that:
//
// for any two rows r1 and r2 in the relation:
// (A(r1) = A(r2)) IS True ==> B(r1) NULL= B(r2)
//
// In other words, if two different non-NULL rows have equal values for A, then
// those rows will also have equal values for B using NULL= semantics. Note that
// both strict and lax equality definitions collapse to the same semantics when
// the columns of A are not-NULL. The example row sets shown above that were
// invalid for a strict dependency are valid for a lax dependency:
//
// b c
// ----------
// NULL 1
// NULL NULL
//
// b c
// ----------
// NULL 1
// NULL 2
//
// To continue the CREATE TABLE example shown above, another FD can now be
// derived from that statement, in addition to the primary key FD:
//
// (a)-->(b,c)
// (b)~~>(a,c)
//
// Lax dependencies are *not* transitive, and have limited usefulness as-is.
// However, some operators (like Select) can "reject" NULL values, which means
// that they filter out rows containing the troublesome NULL values. That makes
// it possible for the operator to "upgrade" a lax dependency to a strict
// dependency (recall that the both have identical semantics when NULLs are not
// present), as in this example:
//
// SELECT * FROM t WHERE b>5
//
// The ">" operator rejects NULL values, which means that the Select operator
// can convert the lax dependency to a strict dependency:
//
// (a)-->(b,c)
// (b)-->(a,c)
//
// Now, either the "a" or "b" column determines the values of all other columns,
// and both are keys for the relation.
//
// Another thing to note is that a lax dependency with an empty determinant is
// the same as the corresponding strict dependency:
//
// ()~~>(a,b)
// ()-->(a,b)
//
// As described above, a strict dependency differs from a lax dependency only in
// terms of what values are allowed in determinant columns. Since the
// determinant has no columns in these cases, the semantics will be identical.
// For that reason, this library automatically maps lax constant dependencies to
// strict constant dependencies.
//
// Keys
//
// A key is a set of columns that have a unique composite value for every row in
// the relation. There are two kinds of keys, strict and lax, that parallel the
// two kinds of functional dependencies. Strict keys treat NULL values in key
// columns as equal to one another:
//
// b c
// --------
// 1 10
// 2 20
// NULL 30
//
// Here, "b" is a key for the relation, even though it contains a NULL value,
// because there is only one such value. Multiple NULL values would violate the
// strict key, because they would compare as equal, and therefore would be
// considered duplicates. The SQL GROUP BY operator uses the same semantics for
// grouping (it's no coincidence that the definition for strict keys follows
// that lead).
//
// By contrast, lax keys treat NULL values in key columns as distinct from one
// another, and so considers column "b" as unique in the following example:
//
// b c
// --------
// 1 10
// 2 20
// NULL 30
// NULL 40
//
// Note that both strict and lax keys treat non-NULL values identically; values
// from two different rows must never compare equal to one another. In addition,
// the presence of a strict or lax key always implies a functional dependency
// with the key as determinant and all other columns in the relation as
// dependants. Here is an example assuming a table with columns (a,b,c,d):
//
// lax-key(a,b) => (a,b)~~>(c,d)
// strict-key(a,b) => (a,b)-->(c,d)
//
// The "empty key" is a special key that has zero columns. It is used when the
// relation is guaranteed to have at most one row. In this special case, every
// column is constant. Every combination of columns is a trivial key for the
// relation and determines every other column. Because the lax and strict key
// designations are equivalent when there is a single row, empty keys are always
// normalized to be strict for convenience.
//
// FuncDepSet tracks whether at least one key (whether it be strict or lax)
// exists for the relation. If this is true, then all possible keys for the
// relation can be enumerated using the FD set. This is because any subset of
// columns forms a key if its FD closure contains every column in the relation.
// Therefore, all keys can be brute force enumerated by checking the closure of
// each combination in the power set. Again, this is only possible when the
// relation is known to have a key; otherwise, knowing the closure contains all
// columns is not a sufficient condition to identify a key, because of the
// possibility of duplicate rows.
//
// In practice, it is never necessary to enumerate all possible keys (fortunate,
// since there can be O(2**N) of them), since the vast majority of them turn out
// to have redundant columns that can be functionally determined from other
// columns in the key. Of more value is the set of "candidate keys", which are
// keys that contain no redundant columns. Removing any column from such a key
// causes it to no longer be a key. It is possible to enumerate the set of
// candidate keys in polynomial rather than exponential time (see Wikipedia
// "Candidate key" entry).
//
// However, since even polynomial time can be problematic, this library tries to
// avoid enumerating keys by storing and maintaining a single candidate key for
// the relation. And while it is not always successful, the library tries to
// keep the candidate key that has the fewest number of columns. In most cases,
// this single key is enough to satisfy the requirements of the optimizer. But
// when it is not enough, or the existing key is no longer valid, then a new
// candidate key can always be generated.
//
// It turns out that the most common key-related question that must be answered
// is not "what are the list of keys for this relation?", but instead, "does
// this set of columns contain a key for the relation?". The latter question can
// be easily answered by computing the closure of the columns, and checking
// whether the closure contains the key maintained by FuncDepSet. And when a
// relatively short key is needed (e.g. during decorrelation), FuncDepSet has
// one ready to go.
//
// Equivalent Columns
//
// FD sets encode "equivalent columns", which are pairs of columns that always
// have equal values using the SQL equality operator with NULL= semantics. Two
// columns a and b are equivalent if the following expression returns true:
//
// ((a = b) OR (a IS NULL AND b IS NULL)) IS True
//
// Equivalent columns are typically derived from a Select filter condition, and
// are represented as two FDs with each column acting as both determinant and
// dependant:
//
// SELECT * FROM t WHERE b=c
// (a)-->(b,c)
// (b)~~>(a,c)
// (b)==(c)
// (c)==(b)
//
// In the common case shown above, the WHERE clause rejects NULL values, so the
// equivalency will always be strict, which means it retains all the same
// properties of a strict dependency. While lax equivalencies are theoretically
// possible, the library currently maps them into regular lax dependencies to
// simplify implementation.
//
// Theory to Practice
//
// For a more rigorous examination of functional dependencies and their
// interaction with various SQL operators, see the following Master's Thesis:
//
// Norman Paulley, Glenn. (2000).
// Exploiting Functional Dependence in Query Optimization.
// https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf
//
// While this paper served as the inspiration for this library, a number of
// details differ, including (but not limited to):
//
// 1. Most importantly, the definition of "lax" used in the paper differs from
// the definition used by this library. For a lax dependency A~~>B, the
// paper allows this set of rows:
//
// a b
// -------
// 1 1
// 1 NULL
//
// This library disallows that, since it requires that if the determinant
// of a lax dependency is not-null, then it is equivalent to a strict
// dependency. This alternate definition is briefly covered in section
// 2.5.3.2 of the paper (see definition 2.19). The reason for this change
// is to allow a lax dependency to be upgraded to a strict dependency more
// readily, needing only the determinant columns to be not-null rather than
// both determinant and dependant columns.
//
// 2. The paper simplifies FD sets so that dependants never contain more than
// one column. This library allows multiple dependent columns, since they
// can be so efficiently stored and processed as ColSets.
//
// 3. The paper deliberately avoids all simplifications when a SQL operator
// adds new FDs to an existing FD set, in order to avoid unneeded work and
// expensive reductions. This library performs quite a few simplifications
// in order to keep the FD set more manageable and understandable.
//
// 4. The paper "colors" columns black when they are no longer part of a
// derived relation. Rather than marking removed columns, this library
// actually removes them from the FD set.
//
// 5. In order to ensure a unique key for every relation, the paper uses a
// special "tuple identifier" that acts like a virtual column and can be
// both a determinant and a dependant. If the transitive closure of any set
// of columns includes the tuple identifier column, then that set of
// columns is a super key for the relation. As described in the Keys
// section above, this library takes a simplified approach so that it
// doesn't need to allocate virtual columns in property derivation code.
//
type FuncDepSet struct {
// deps contains the functional dependencies that have a non-trivial
// determinant and dependant (i.e. not empty, with no overlapping columns):
//
// (a)-->(b,c)
// (b,c)~~>(a,d)
// (d)==(e)
// (e)==(d)
//
// See the above comments for more details.
//
// This slice is owned by this FuncDepSet and shouldn't be shared unless
// all referencing sets are treated as immutable.
deps []funcDep
// hasKey is:
// - strictKey if the relation has no duplicate rows, which means at least
// one subset of its columns form a key (all columns, if no other subset).
// The key field contains one such key. See the "Keys" section above for
// more details. A strict key can be empty.
// - laxKey if there is a at least one subset of columns that form a lax key.
// The key field contains one such key. A lax key cannot be empty.
//
// See the "Keys" section above for more details.
hasKey keyType
// key contains a set of columns that form a key or a lax key for the
// relation, depending on hasKey; empty if hasKey is noKey.
//
// There is no guarantee that the key has the minimum possible number of
// columns, but a best effort is made to keep it as short as possible.
//
// See the "Keys" section above for more details.
//
// This set is immutable; to update it, replace it with a different set
// containing the desired columns.
key opt.ColSet
}
type keyType int8
const (
noKey keyType = iota
laxKey
strictKey
)
// funcDep stores a single functional dependency. See the comment for FuncDepSet
// for more details.
type funcDep struct {
// from is the determinant of the functional dependency (easier to read the
// code when "from" is used rather than "determinant"). If equiv is true,
// from may only consist of a single column.
//
// This set is immutable; to update it, replace it with a different set
// containing the desired columns.
from opt.ColSet
// to is the dependant of the functional dependency (easier to read the code
// when "to" is used rather than "dependant").
//
// This set is immutable; to update it, replace it with a different set
// containing the desired columns.
to opt.ColSet
// strict is true if NULL values in the determinant are treated as if they are
// equal to other NULL values. Every NULL determinant must therefore map to
// the same dependant value. If strict is false, then two NULL determinants
// can map to different dependant values. See the NULL Values section in the
// FuncDeps comment for more details.
strict bool
// equiv is true if the value of the determinant equals the value of each of
// the dependant columns, and false if there's no known equality relationship.
// If equiv is true, the determinant may only consist of a single column.
equiv bool
}
// StrictKey returns a strict key for the relation, if there is one.
// A best effort is made to return a candidate key that has the fewest columns.
func (f *FuncDepSet) StrictKey() (_ opt.ColSet, ok bool) {
if f.hasKey == strictKey {
return f.key, true
}
return opt.ColSet{}, false
}
// LaxKey returns a lax key for the relation, if there is one.
// Note that strict keys are implicitly also lax keys, so if the relation has a
// strict key, this returns the same key as StrictKey().
// A best effort is made to return a lax key that has the fewest columns.
func (f *FuncDepSet) LaxKey() (_ opt.ColSet, ok bool) {
if f.hasKey != noKey {
return f.key, true
}
return opt.ColSet{}, false
}
// Empty is true if the set contains no FDs and no key.
func (f *FuncDepSet) Empty() bool {
return len(f.deps) == 0 && f.hasKey == noKey
}
// ColSet returns all columns referenced by the FD set.
func (f *FuncDepSet) ColSet() opt.ColSet {
var cols opt.ColSet
for i := 0; i < len(f.deps); i++ {
fd := &f.deps[i]
cols.UnionWith(fd.from)
cols.UnionWith(fd.to)
}
if f.hasKey != noKey {
// There are cases where key columns don't show up in FDs. For example:
// lax-key(2,3); ()-->(1)
cols.UnionWith(f.key)
}
return cols
}
// HasMax1Row returns true if the relation has zero or one rows.
func (f *FuncDepSet) HasMax1Row() bool {
return f.hasKey == strictKey && f.key.Empty()
}
// CopyFrom copies the given FD into this FD, replacing any existing data.
func (f *FuncDepSet) CopyFrom(fdset *FuncDepSet) {
// Make certain to copy FDs to the slice owned by this set.
f.deps = f.deps[:0]
f.deps = append(f.deps, fdset.deps...)
f.key = fdset.key
f.hasKey = fdset.hasKey
}
// RemapFrom copies the given FD into this FD, remapping column IDs according to
// the given function.
func (f *FuncDepSet) RemapFrom(fdset *FuncDepSet, remap func(opt.ColumnID) opt.ColumnID) {
f.CopyFrom(fdset)
remapColSet := func(in opt.ColSet) (out opt.ColSet) {
for c, ok := in.Next(0); ok; c, ok = in.Next(c + 1) {
out.Add(remap(c))
}
return out
}
for i := range f.deps {
f.deps[i].from = remapColSet(f.deps[i].from)
f.deps[i].to = remapColSet(f.deps[i].to)
}
f.key = remapColSet(f.key)
}
// ColsAreStrictKey returns true if the given columns contain a strict key for the
// relation. This means that any two rows in the relation will never have the
// same values for this set of columns. If the columns are nullable, then at
// most one row could have NULL values for all of the columns. For example,
// (a,b) is a strict key for the following relation, but (a) is not (because
// there are multiple rows where a=1 and a=NULL):
//
// a b c
// ----------------
// NULL NULL NULL
// NULL 1 1
// 1 NULL 1
// 1 1 1
//
func (f *FuncDepSet) ColsAreStrictKey(cols opt.ColSet) bool {
return f.colsAreKey(cols, strictKey)
}
// ColsAreLaxKey returns true if the given columns contain a lax key for the
// relation. This means that any two rows in the relation will never have the
// same values for this set of columns, except potentially in the case where at
// least one of the columns is NULL. For example, (a,b) is a lax key for the
// following relation, but (a) is not (because there are multiple rows where
// a=1):
//
// a b c
// ----------------
// NULL NULL NULL
// NULL NULL 1
// NULL NULL 2
// NULL 1 1
// NULL 1 2
// 1 NULL 1
// 1 NULL 2
// 1 1 1
//
func (f *FuncDepSet) ColsAreLaxKey(cols opt.ColSet) bool {
return f.colsAreKey(cols, laxKey)
}
// ConstantCols returns the set of columns that will always have the same value
// for all rows in the relation.
func (f *FuncDepSet) ConstantCols() opt.ColSet {
if len(f.deps) > 0 && f.deps[0].isConstant() {
return f.deps[0].to
}
return opt.ColSet{}
}
// ReduceCols removes redundant columns from the given set. Redundant columns
// can be functionally determined from the remaining columns. If the columns
// contain a key for the relation, then the reduced columns will form a
// candidate key for the relation.
//
// The reduction algorithm removes one column at a time (in an arbitrary order),
// and then tests to see if the closure still includes the removed column. If
// so, then the column is redundant. This algorithm has decent running time, but
// will not necessarily find the candidate key with the fewest columns.
func (f *FuncDepSet) ReduceCols(cols opt.ColSet) opt.ColSet {
var removed opt.ColSet
cols = cols.Copy()
for i, ok := cols.Next(0); ok; i, ok = cols.Next(i + 1) {
cols.Remove(i)
removed.Add(i)
if !f.inClosureOf(removed, cols, true /* strict */) {
// The column is not functionally determined by the other columns, so
// retain it in the set.
cols.Add(i)
}
removed.Remove(i)
}
return cols
}
// InClosureOf returns true if the given columns are functionally determined by
// the "in" column set.
func (f *FuncDepSet) InClosureOf(cols, in opt.ColSet) bool {
return f.inClosureOf(cols, in, true /* strict */)
}
// ComputeClosure returns the strict closure of the given columns. The closure
// includes the input columns plus all columns that are functionally dependent
// on those columns, either directly or indirectly. Consider this set of FD's:
//
// (a)-->(b,c,d)
// (b,c,e)-->(f)
// (d)-->(e)
//
// The strict closure of (a) is (a,b,c,d,e,f), because (a) determines all other
// columns. Therefore, if two rows have the same value for (a), then the rows
// will be duplicates, since all other columns will be equal.
func (f *FuncDepSet) ComputeClosure(cols opt.ColSet) opt.ColSet {
cols = cols.Copy()
for i := 0; i < len(f.deps); i++ {
fd := &f.deps[i]
if fd.strict && fd.from.SubsetOf(cols) && !fd.to.SubsetOf(cols) {
cols.UnionWith(fd.to)
// Restart iteration to get transitive closure.
i = -1
}
}
return cols
}
// AreColsEquiv returns true if the two given columns are equivalent.
func (f *FuncDepSet) AreColsEquiv(col1, col2 opt.ColumnID) bool {
for i := range f.deps {
fd := &f.deps[i]
if fd.equiv && fd.strict {
if (fd.from.Contains(col1) && fd.to.Contains(col2)) ||
(fd.from.Contains(col2) && fd.to.Contains(col1)) {
return true
}
}
}
return false
}
// ComputeEquivClosure returns the equivalence closure of the given columns. The
// closure includes the input columns plus all columns that are equivalent to
// any of these columns, either directly or indirectly. For example:
//
// (a)==(b)
// (b)==(c)
// (a)==(d)
//
// The equivalence closure for (a) is (a,b,c,d) because (a) is transitively
// equivalent to all other columns. Therefore, all columns must have equal
// non-NULL values, or else all must be NULL (see definition for NULL= in the
// comment for FuncDepSet).
func (f *FuncDepSet) ComputeEquivClosure(cols opt.ColSet) opt.ColSet {
// Don't need to get transitive closure, because equivalence closures are
// already maintained for every column.
cols = cols.Copy()
for i := range f.deps {
fd := &f.deps[i]
if fd.equiv && fd.from.SubsetOf(cols) && !fd.to.SubsetOf(cols) {
cols.UnionWith(fd.to)
}
}
return cols
}
// AddStrictKey adds an FD for a new key. The given key columns are reduced to a
// candidate key, and that becomes the determinant for the allCols column set.
// The resulting FD is strict, meaning that a NULL key value always maps to the
// same set of values in the rest of the relation's columns. For key columns
// (a,b) and relation columns (a,b,c,d), an FD like this is created:
//
// (a,b)-->(c,d)
//
// If the resulting candidate key has fewer columns than the current key, then
// the new key is adopted in its place.
func (f *FuncDepSet) AddStrictKey(keyCols, allCols opt.ColSet) {
if !keyCols.SubsetOf(allCols) {
panic(errors.AssertionFailedf("allCols does not include keyCols"))
}
// Ensure we have candidate key (i.e. has no columns that are functionally
// determined by other columns).
keyCols = f.ReduceCols(keyCols)
f.addDependency(keyCols, allCols, true /* strict */, false /* equiv */)
// Try to use the new FD to reduce any existing key first.
f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
if f.hasKey != strictKey || keyCols.Len() < f.key.Len() {
f.setKey(keyCols, strictKey)
}
}
// AddLaxKey is similar to AddStrictKey, except that it creates a lax FD rather
// than a strict FD. This means that two rows with NULL key values might not
// have the same values in other non-key columns. For key columns (a,b) and
// relation columns (a,b,c,d), and FD like this is created:
//
// (a,b)~~>(c,d)
//
func (f *FuncDepSet) AddLaxKey(keyCols, allCols opt.ColSet) {
if !keyCols.SubsetOf(allCols) {
panic(errors.AssertionFailedf("allCols does not include keyCols"))
}
if keyCols.Empty() {
panic(errors.AssertionFailedf("lax key cannot be empty"))
}
// Ensure we have candidate key (i.e. has no columns that are functionally
// determined by other columns).
f.addDependency(keyCols, allCols, false /* strict */, false /* equiv */)
// TODO(radu): without null column information, we cannot reduce lax keys (see
// tryToReduceKey). Consider passing that information (or storing it with the
// FDs to begin with). In that case we would need to reduce both the given key
// and the existing key, similar to AddStrictKey.
if f.hasKey == noKey || (f.hasKey == laxKey && keyCols.Len() < f.key.Len()) {
f.setKey(keyCols, laxKey)
}
}
// MakeMax1Row initializes the FD set for a relation containing either zero or
// one rows, and with the given columns. In this special case, the value of
// every column is trivially considered a constant, and the key is the empty
// set, because no columns are required to ensure uniqueness of rows. This
// special case may seem trivial, but it is quite important to detect during
// optimization. For a relation with columns (a, b), the following FD is
// created in the set:
//
// ()-->(a,b)
//
// If f has equivalence dependencies of columns that are a subset of cols, those
// dependencies are retained in f. This prevents losing additional information
// about the columns, which a single FD with an empty key cannot describe. For
// example:
//
// f: (a)-->(b,c), (a)==(b), (b)==(a), (a)==(c), (c)==(a)
// cols: (a,c)
// result: ()-->(a,c), (a)==(c), (c)==(a)
//
func (f *FuncDepSet) MakeMax1Row(cols opt.ColSet) {
// Remove all FDs except for equivalency FDs with columns that are a subset
// of cols.
copyIdx := 0
for i := range f.deps {
fd := &f.deps[i]
// Skip non-equivalence dependencies.
if !fd.equiv {
continue
}
// If fd is an equivalence dependency, from has a single column.
fromCol, ok := fd.from.Next(0)
if !ok {
panic(errors.AssertionFailedf("equivalence dependency from cannot be empty"))
}
// Add a new equivalence dependency with the same from column and the to
// columns that are present in cols.
if cols.Contains(fromCol) && fd.to.Intersects(cols) {
f.deps[copyIdx] = funcDep{
from: fd.from,
to: fd.to.Intersection(cols),
strict: true,
equiv: true,
}
copyIdx++
}
}
f.deps = f.deps[:copyIdx]
if !cols.Empty() {
f.deps = append(f.deps, funcDep{to: cols, strict: true})
// The constant FD must be the first in the set.
f.deps[0], f.deps[len(f.deps)-1] = f.deps[len(f.deps)-1], f.deps[0]
}
f.setKey(opt.ColSet{}, strictKey)
}
// MakeNotNull modifies the FD set based on which columns cannot contain NULL
// values. This often allows upgrading lax dependencies to strict dependencies,
// and lax keys to strict keys.
//
// Note: this function should be called with all known null columns; it won't do
// as good of a job if it's called multiple times with different subsets.
func (f *FuncDepSet) MakeNotNull(notNullCols opt.ColSet) {
// We have to collect all the FDs that can be made strict. We avoid allocation
// for the case where there is at most one such FD.
var firstLaxFD *funcDep
var otherLaxFDs []funcDep
for i := range f.deps {
fd := &f.deps[i]
if fd.strict {
continue
}
// FD can be made strict if all determinant columns are not null.
if fd.from.SubsetOf(notNullCols) {
if firstLaxFD == nil {
firstLaxFD = fd
} else {
otherLaxFDs = append(otherLaxFDs, *fd)
}
}
}
if firstLaxFD != nil {
f.addDependency(firstLaxFD.from, firstLaxFD.to, true /* strict */, false /* equiv */)
for i := range otherLaxFDs {
f.addDependency(otherLaxFDs[i].from, otherLaxFDs[i].to, true /* strict */, false /* equiv */)
}
}
f.tryToReduceKey(notNullCols)
}
// AddEquivalency adds two FDs to the set that establish a strict equivalence
// between the given columns. Either "a" equals "b" according to SQL equality
// semantics, or else "a" is NULL and "b" is NULL. The following FDs are
// created in the set:
//
// (a)==(b)
// (b)==(a)
//
func (f *FuncDepSet) AddEquivalency(a, b opt.ColumnID) {
if a == b {
return
}
var equiv opt.ColSet
equiv.Add(a)
equiv.Add(b)
f.addEquivalency(equiv)
}
// AddConstants adds a strict FD to the set that declares the given column as
// having the same constant value for all rows. If the column is nullable, then
// its value may be NULL, but then the column must be NULL for all rows. For
// column "a", the FD looks like this:
//
// ()-->(a)
//
// Since it is a constant, any set of determinant columns (including the empty
// set) trivially determines the value of "a".
func (f *FuncDepSet) AddConstants(cols opt.ColSet) {
if cols.Empty() {
return
}
// Determine complete set of constants by computing closure.
cols = f.ComputeClosure(cols)
// Ensure that first FD in the set is a constant FD and make sure the
// constants are part of it.
if len(f.deps) == 0 || !f.deps[0].isConstant() {
deps := make([]funcDep, len(f.deps)+1)
deps[0] = funcDep{to: cols, strict: true}
copy(deps[1:], f.deps)
f.deps = deps
} else {
// Update existing constant FD to include all constant columns in the set.
f.deps[0].to = cols
}
// Remove any other FDs made redundant by adding the constants.
n := 1
for i := 1; i < len(f.deps); i++ {
fd := &f.deps[i]
// Always retain equivalency information, even for constants.
if !fd.equiv {
if fd.strict {
// Constant columns can be removed from the determinant of a strict
// FD. If all determinant columns are constant, then the entire FD
// can be removed, since this means that the dependant columns must
// also be constant (and were part of constant closure added to the
// constant FD above).
if !fd.removeFromCols(cols) {
continue
}
}
// Dependant constants are redundant, so remove them.
if !fd.removeToCols(cols) {
continue
}
}
if n != i {
f.deps[n] = f.deps[i]
}
n++
}
f.deps = f.deps[:n]
f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
}
// AddSynthesizedCol adds an FD to the set that is derived from a synthesized
// column in a projection list. The synthesized column is often derived from
// other columns, in which case AddSynthesizedCol creates a new FD like this:
//
// (a,b)-->(c)
//
// Or it may be a constant column, like this:
//
// ()-->(c)
//
func (f *FuncDepSet) AddSynthesizedCol(from opt.ColSet, col opt.ColumnID) {
if from.Contains(col) {
panic(errors.AssertionFailedf("synthesized column cannot depend upon itself"))
}
var colSet opt.ColSet
colSet.Add(col)
f.addDependency(from, colSet, true /* strict */, false /* equiv */)
f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
}
// ProjectCols removes all columns that are not in the given set. It does this
// by replacing any un-projected dependants by their closures, and then removing
// all FDs containing un-projected columns. While this algorithm may cause some
// loss of information in edge cases, it does a good job of preserving the most
// important dependency information.
func (f *FuncDepSet) ProjectCols(cols opt.ColSet) {
// Ensure that any existing key contains only projected columns. Do this
// before removing any FDs from the set, in order to take advantage of all
// existing transitive relationships.
if f.hasKey != noKey && !f.key.SubsetOf(cols) {
// Derive new candidate key (or key is no longer possible).
if f.hasKey == strictKey && f.ColsAreStrictKey(cols) {
f.setKey(cols, strictKey)
f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
} else if f.ColsAreLaxKey(cols) {
f.setKey(cols, laxKey)
f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
} else {
f.clearKey()
}
}
// Special case of <= 1 row.
if f.hasKey == strictKey && f.key.Empty() {
f.MakeMax1Row(cols)
return
}