-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
tc.c
2018 lines (1792 loc) · 63.4 KB
/
tc.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Project Calico BPF dataplane programs.
// Copyright (c) 2020-2023 Tigera, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
#include <linux/types.h>
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/in.h>
#include <linux/udp.h>
#include <linux/if_ether.h>
#include <iproute2/bpf_elf.h>
// stdbool.h has no deps so it's OK to include; stdint.h pulls in parts
// of the std lib that aren't compatible with BPF.
#include <stdbool.h>
#include "bpf.h"
/* CALI_IFACE_LOG emits a log line prefixed with the interface name taken from
 * ctx->globals->data.iface_name. A variable named `ctx` must therefore be in
 * scope wherever the macro is expanded.
 */
#define CALI_IFACE_LOG(fmt, ...) \
	bpf_log("%s" fmt, ctx->globals->data.iface_name, ## __VA_ARGS__)

/* CALI_LOG emits a log line tagged with the interface name followed by the
 * direction of the hook the program was compiled for: "-I: " when
 * CALI_TC_INGRESS is set in CALI_COMPILE_FLAGS and "-E: " otherwise. Whether
 * CALI_TC_HOST_EP is set makes no difference to the tag.
 */
#define CALI_LOG(fmt, ...) do { \
	if ((CALI_COMPILE_FLAGS) & CALI_TC_INGRESS) { \
		CALI_IFACE_LOG("-I: " fmt, ## __VA_ARGS__); \
	} else { \
		CALI_IFACE_LOG("-E: " fmt, ## __VA_ARGS__); \
	} \
} while (0)
#include "types.h"
#include "counters.h"
#include "skb.h"
#include "policy.h"
#include "conntrack.h"
#include "nat.h"
#include "nat_lookup.h"
#include "routes.h"
#include "jump.h"
#include "reasons.h"
#include "icmp.h"
#include "arp.h"
#include "sendrecv.h"
#include "fib.h"
#include "rpf.h"
#include "parsing.h"
#include "tc.h"
#include "failsafe.h"
#include "metadata.h"
#include "bpf_helpers.h"
#include "rule_counters.h"
/* Compile-time gate for host conntrack-conflict resolution: the jump to
 * PROG_INDEX_HOST_CT_CONFLICT is only attempted when this evaluates to true,
 * i.e. in programs built with CALI_F_TO_HEP.
 */
#define HAS_HOST_CONFLICT_PROG CALI_F_TO_HEP
/* calico_tc_main is the main function used in all of the tc programs. It is specialised
 * for particular hook at build time based on the CALI_F build flags.
 *
 * Processing order:
 *   1. Exits that need no per-packet state: loopback traffic heading to the
 *      host, packets from the host that already carry the bypass mark,
 *      bpfnat turn-arounds and packets that XDP has already accepted.
 *   2. Build the on-stack context, zero the shared state and count the packet.
 *   3. On programs that send packets out of the host namespace, honour the
 *      "approved for forward" bypass mark.
 *   4. Parse as far as the IP header and hand over to pre_policy_processing().
 *
 * Returns TC_ACT_UNSPEC on the early-exit paths in step 1; otherwise returns
 * whatever pre_policy_processing() or forward_or_drop() return.
 */
SEC("tc")
int calico_tc_main(struct __sk_buff *skb)
{
#ifdef UNITTEST
/* UT-only workaround to allow us to run the program with BPF_TEST_PROG_RUN
 * and simulate a specific mark
 */
skb->mark = SKB_MARK;
#endif
if (CALI_F_LO && CALI_F_TO_HOST) {
/* Do nothing, it is a packet that just looped around. */
return TC_ACT_UNSPEC;
}
/* Optimisation: if another BPF program has already pre-approved the packet,
 * skip all processing. */
if (CALI_F_FROM_HOST && skb->mark == CALI_SKB_MARK_BYPASS) {
if (CALI_LOG_LEVEL >= CALI_LOG_LEVEL_DEBUG) {
/* Build a context and parse the packet purely to get richer debug
 * output. The result of parse_packet_ip() is ignored and the verdict
 * below does not depend on it. */
DECLARE_TC_CTX(_ctx,
.skb = skb,
.fwd = {
.res = TC_ACT_UNSPEC,
.reason = CALI_REASON_UNKNOWN,
},
.ipheader_len = IP_SIZE,
);
struct cali_tc_ctx *ctx = &_ctx;
CALI_DEBUG("New packet at ifindex=%d; mark=%x", skb->ifindex, skb->mark);
parse_packet_ip(ctx);
CALI_DEBUG("Final result=ALLOW (%d). Bypass mark set.", CALI_REASON_BYPASS);
}
return TC_ACT_UNSPEC;
}
/* Programs attached to the bpfnat interface only rewrite the mark for the
 * two bypass values below. Any other mark falls out of the switch and
 * continues with normal processing. */
if (CALI_F_NAT_IF) {
switch (skb->mark) {
case CALI_SKB_MARK_BYPASS:
/* We are turning a packet around to a local WEP using bpfnat
 * iface, the WEP should do normal processing.
 */
skb->mark = 0UL;
CALI_LOG_IF(CALI_LOG_LEVEL_INFO,
"Final result=ALLOW (%d). Bypass mark set at bpfnat local WL", CALI_REASON_BYPASS);
return TC_ACT_UNSPEC;
case CALI_SKB_MARK_BYPASS_FWD:
/* We are turning a packet around from lo to a remote WEP using
 * bpfnat iface. Next hop is a HEP and it should just forward the
 * packet.
 */
/* NOTE(review): the scoped __u32 temporary looks like a deliberate way to
 * force a 32-bit store of the mark - confirm before simplifying. */
{
__u32 mark = CALI_SKB_MARK_BYPASS;
skb->mark = mark;
}
CALI_LOG_IF(CALI_LOG_LEVEL_INFO,
"Final result=ALLOW (%d). Bypass mark set at bpfnat remote WL", CALI_REASON_BYPASS);
return TC_ACT_UNSPEC;
}
}
/* Optimisation: if XDP program has already accepted the packet,
 * skip all processing. */
if (CALI_F_FROM_HEP) {
if (xdp2tc_get_metadata(skb) & CALI_META_ACCEPTED_BY_XDP) {
CALI_LOG_IF(CALI_LOG_LEVEL_INFO,
"Final result=ALLOW (%d). Accepted by XDP.", CALI_REASON_ACCEPTED_BY_XDP);
/* Set the bypass mark on the packet before letting it continue. */
skb->mark = CALI_SKB_MARK_BYPASS;
return TC_ACT_UNSPEC;
}
}
/* Initialise the context, which is stored on the stack, and the state, which
 * we use to pass data from one program to the next via tail calls. */
DECLARE_TC_CTX(_ctx,
.skb = skb,
.fwd = {
.res = TC_ACT_UNSPEC,
.reason = CALI_REASON_UNKNOWN,
},
.ipheader_len = IP_SIZE,
);
struct cali_tc_ctx *ctx = &_ctx;
/* The state is zeroed before any field of it is written for this packet. */
__builtin_memset(ctx->state, 0, sizeof(*ctx->state));
CALI_DEBUG("New packet at ifindex=%d; mark=%x", skb->ifindex, skb->mark);
counter_inc(ctx, COUNTER_TOTAL_PACKETS);
/* The start timestamp is only taken when it will be used: at INFO logging
 * or above, or when profiling is compiled in. */
if (CALI_LOG_LEVEL >= CALI_LOG_LEVEL_INFO || PROFILING) {
ctx->state->prog_start_time = bpf_ktime_get_ns();
}
/* We only try a FIB lookup and redirect for packets that are towards the host.
 * For packets that are leaving the host namespace, routing has already been done. */
fwd_fib_set(&ctx->fwd, CALI_F_TO_HOST);
if (CALI_F_TO_HEP || CALI_F_TO_WEP) {
/* We're leaving the host namespace, check for other bypass mark bits.
 * These are a bit more complex to handle so we do it after creating the
 * context/state. */
switch (skb->mark & CALI_SKB_MARK_BYPASS_MASK) {
case CALI_SKB_MARK_BYPASS_FWD:
CALI_DEBUG("Packet approved for forward.");
counter_inc(ctx, CALI_REASON_BYPASS);
goto allow;
}
}
/* Parse the packet as far as the IP header; as a side-effect this validates the packet size
 * is large enough for UDP. */
/* Only the success code that matches the IP version this program is built
 * for (IPVER6 or not) is accepted; anything else not listed below lands in
 * the default case and is dropped. */
switch (parse_packet_ip(ctx)) {
#ifdef IPVER6
case PARSING_OK_V6:
// IPv6 Packet.
break;
#else
case PARSING_OK:
// IPv4 Packet.
break;
#endif
case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY:
// A packet that we automatically let through
fwd_fib_set(&ctx->fwd, false);
ctx->fwd.res = TC_ACT_UNSPEC;
goto finalize;
case PARSING_ERROR:
default:
// A malformed packet or a packet we don't support
CALI_DEBUG("Drop malformed or unsupported packet");
ctx->fwd.res = TC_ACT_SHOT;
goto finalize;
}
return pre_policy_processing(ctx);
/* Both labels share one exit: forward_or_drop() acts on what was recorded in
 * ctx->fwd above. */
allow:
finalize:
return forward_or_drop(ctx);
}
/* pre_policy_processing copies what later programs need from the packet into
 * the shared state, unwraps VXLAN traffic on programs that decap, and then
 * performs the conntrack lookup followed by calico_tc_process_ct_lookup().
 *
 * Header parsing can short-circuit in two ways: a parsing error drops the
 * packet, and PARSING_ALLOW_WITHOUT_ENFORCING_POLICY lets it through without
 * any further checks.
 *
 * Returns the value produced by forward_or_drop() for this packet.
 */
static CALI_BPF_INLINE int pre_policy_processing(struct cali_tc_ctx *ctx)
{
	/* Downstream programs read these fields from the state, not the packet. */
	tc_state_fill_from_iphdr(ctx);

	if (CALI_F_LO && (GLOBAL_FLAGS & CALI_GLOBALS_LO_UDP_ONLY)) {
		if (ctx->state->ip_proto != IPPROTO_UDP) {
			CALI_DEBUG("Allowing because it is not UDP");
			goto out;
		}
	}

	/* Source/dest ports, or type/code for ICMP. */
	switch (tc_state_fill_from_nexthdr(ctx, dnat_should_decap())) {
	case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY:
		goto out;
	case PARSING_ERROR:
		goto deny;
	default:
		break;
	}

	/* With the UDP header parsed we can tell whether this is one of our own
	 * VXLAN packets, which carry forwarded node port traffic.
	 * dnat_should_decap() is a compile-time property of the program.
	 */
	if (dnat_should_decap()) {
		if (is_vxlan_tunnel(ctx, VXLAN_PORT)) {
			/* vxlan_attempt_decap revalidates the packet if needed. */
			switch (vxlan_attempt_decap(ctx)) {
			case -2:
				/* Non-BPF VXLAN packet from another Calico node. */
				CALI_DEBUG("VXLAN packet from known Calico host, allow.");
				fwd_fib_set(&(ctx->fwd), false);
				goto out;
			case -1:
				/* Problem decoding the packet. */
				goto deny;
			default:
				break;
			}

			/* The inner packet is now the packet: refresh the state
			 * from its IP header and its ports (or ICMP type/code).
			 */
			tc_state_fill_from_iphdr(ctx);

			switch (tc_state_fill_from_nexthdr(ctx, dnat_should_decap())) {
			case PARSING_ALLOW_WITHOUT_ENFORCING_POLICY:
				goto out;
			case PARSING_ERROR:
				goto deny;
			default:
				break;
			}
		}
	}

	ctx->state->pol_rc = CALI_POL_NO_MATCH;

	/* Conntrack comes before everything else. */
	ctx->state->ct_result = calico_ct_lookup(ctx);
	calico_tc_process_ct_lookup(ctx);
	goto out;

deny:
	ctx->fwd.res = TC_ACT_SHOT;
	/* fall through to the single exit */
out:
	return forward_or_drop(ctx);
}
/* calico_tc_process_ct_lookup acts on the conntrack lookup result already
 * stored in ctx->state->ct_result and decides how the packet proceeds:
 *
 *  - Conntrack hit: policy is skipped and the PROG_INDEX_ALLOWED program is
 *    jumped to. The exception is a TCP SYN on a hit, which is still sent
 *    through policy.
 *  - CALI_CT_NEW: service NAT is resolved, RPF and route checks are applied,
 *    the state is filled in for the policy program and that program is jumped
 *    to.
 *  - Mid-flow miss: falls through to iptables towards the host, is dropped
 *    away from the host, or continues with CT state creation suppressed on a
 *    HEP.
 *  - Deny: ctx->fwd.res is set to TC_ACT_SHOT.
 *
 * The CALI_JUMP_TO* calls are not expected to return; the code after each one
 * handles the case where the jump did not happen. When this function does
 * return, the caller acts on what was recorded in ctx->fwd.
 */
static CALI_BPF_INLINE void calico_tc_process_ct_lookup(struct cali_tc_ctx *ctx)
{
	CALI_DEBUG("conntrack entry flags 0x%x", ctx->state->ct_result.flags);

	/* We are forwarding a packet if it has a seen mark (that is another
	 * program has seen it already) and is either not routed through the
	 * bpfnat iface (which may be true for host traffic) or has the specific
	 * reasons set.
	 */
	bool forwarding = CALI_F_EGRESS &&
		skb_mark_equals(ctx->skb, CALI_SKB_MARK_SEEN_MASK, CALI_SKB_MARK_SEEN) &&
		(!skb_mark_equals(ctx->skb, CALI_SKB_MARK_FROM_NAT_IFACE_OUT, CALI_SKB_MARK_FROM_NAT_IFACE_OUT) ||
		 (skb_mark_equals(ctx->skb, CALI_SKB_MARK_BYPASS_MASK, CALI_SKB_MARK_FALLTHROUGH) ||
		  skb_mark_equals(ctx->skb, CALI_SKB_MARK_BYPASS_MASK, CALI_SKB_MARK_NAT_OUT) ||
		  skb_mark_equals(ctx->skb, CALI_SKB_MARK_BYPASS_MASK, CALI_SKB_MARK_MASQ) ||
		  skb_mark_equals(ctx->skb, CALI_SKB_MARK_BYPASS_MASK, CALI_SKB_MARK_SKIP_FIB)));

	if (HAS_HOST_CONFLICT_PROG &&
	    /* Do not do conflict resolution for host-self loop. Unlike with
	     * traffic to another backend, we are not able to tell traffic to
	     * self via service from straight to self.
	     */
	    !CALI_F_LO &&
	    /* Do conflict resolution on other device if it clashes with
	     * traffic looped via the NAT_IF but it hasn't been seen yet and
	     * is not looped via the NAT_IF, that is, it is from host, but not
	     * to a service.
	     */
	    (ctx->state->ct_result.flags & CALI_CT_FLAG_VIA_NAT_IF) &&
	    !(ctx->skb->mark & (CALI_SKB_MARK_FROM_NAT_IFACE_OUT | CALI_SKB_MARK_SEEN))) {
		CALI_DEBUG("Host source SNAT conflict");
		CALI_JUMP_TO(ctx, PROG_INDEX_HOST_CT_CONFLICT);
		/* Only reached if the jump did not happen. */
		CALI_DEBUG("Failed to call conflict resolution.");
		goto deny;
	}

	/* Check if someone is trying to spoof a tunnel packet */
	if (CALI_F_FROM_HEP && ct_result_tun_src_changed(ctx->state->ct_result.rc)) {
		CALI_DEBUG("dropping tunnel pkt with changed source node");
		goto deny;
	}

	if (ctx->state->ct_result.flags & CALI_CT_FLAG_NAT_OUT) {
		ctx->state->flags |= CALI_ST_NAT_OUTGOING;
	}

	if (CALI_F_TO_HOST && !CALI_F_NAT_IF &&
	    (ct_result_rc(ctx->state->ct_result.rc) == CALI_CT_ESTABLISHED ||
	     ct_result_rc(ctx->state->ct_result.rc) == CALI_CT_ESTABLISHED_BYPASS) &&
	    (ctx->state->ct_result.flags & CALI_CT_FLAG_VIA_NAT_IF)) {
		CALI_DEBUG("should route via bpfnatout");
		ctx->fwd.mark |= CALI_SKB_MARK_TO_NAT_IFACE_OUT;
		/* bpfnatout need to process the packet */
		ct_result_set_rc(ctx->state->ct_result.rc, CALI_CT_ESTABLISHED);
	}

	if (ct_result_rpf_failed(ctx->state->ct_result.rc)) {
		goto deny;
	}

	if (ct_result_rc(ctx->state->ct_result.rc) == CALI_CT_MID_FLOW_MISS) {
		if (CALI_F_TO_HOST) {
			/* Mid-flow miss: let iptables handle it in case it's an existing flow
			 * in the Linux conntrack table. We can't apply policy or DNAT because
			 * it's too late in the flow. iptables will drop if the flow is not
			 * known.
			 */
			CALI_DEBUG("CT mid-flow miss; fall through to iptables.");
			ctx->fwd.mark = CALI_SKB_MARK_FALLTHROUGH;
			fwd_fib_set(&ctx->fwd, false);
			goto finalize;
		} else {
			if (CALI_F_HEP) {
				// HEP egress for a mid-flow packet with no BPF or Linux CT state.
				// This happens, for example, with asymmetric untracked policy,
				// where we want the return path packet to be dropped if there is a
				// HEP present (regardless of the policy configured on it, for
				// consistency with the iptables dataplane's invalid CT state
				// check), but allowed if there is no HEP, i.e. the egress interface
				// is a plain data interface. Unfortunately we have no simple check
				// for "is there a HEP here?" All we can do - below - is try to
				// tail call the policy program; if that attempt returns, it means
				// there is no HEP. So what we can do is set a state flag to record
				// the situation that we are in, then let the packet continue. If
				// we find that there is no policy program - i.e. no HEP - the
				// packet is correctly allowed. If there is a policy program and it
				// denies, fine. If there is a policy program and it allows, but
				// the state flag is set, we drop the packet at the start of
				// calico_tc_skb_accepted_entrypoint.
				//
				// Also we are mid-flow and so it's important to suppress any CT
				// state creation - which normally follows when a packet is allowed
				// through - because that CT state would not be correct. Basically,
				// unless we see the SYN packet that starts a flow, we should never
				// have CT state for that flow.
				//
				// Net, we can use the same flag, CALI_ST_SUPPRESS_CT_STATE, both to
				// suppress CT state creation and to drop the packet if we find that
				// there is a HEP present.
				CALI_DEBUG("CT mid-flow miss to HEP with no Linux conntrack entry: "
					   "continue but suppressing CT state creation.");
				ctx->state->flags |= CALI_ST_SUPPRESS_CT_STATE;
				ct_result_set_rc(ctx->state->ct_result.rc, CALI_CT_NEW);
			} else {
				CALI_DEBUG("CT mid-flow miss away from host with no Linux "
					   "conntrack entry, drop.");
				goto deny;
			}
		}
	}

	/* Skip policy if we get conntrack hit */
	if (ct_result_rc(ctx->state->ct_result.rc) != CALI_CT_NEW) {
		if (ctx->state->ct_result.flags & CALI_CT_FLAG_SKIP_FIB) {
			ctx->state->flags |= CALI_ST_SKIP_FIB;
		}
		CALI_DEBUG("CT Hit");

		if (ctx->state->ip_proto == IPPROTO_TCP && ct_result_is_syn(ctx->state->ct_result.rc)) {
			CALI_DEBUG("Forcing policy on SYN");
			if (ct_result_rc(ctx->state->ct_result.rc) == CALI_CT_ESTABLISHED_DNAT) {
				/* Set DNAT info for policy */
				ctx->state->post_nat_ip_dst = ctx->state->ct_result.nat_ip;
				ctx->state->post_nat_dport = ctx->state->ct_result.nat_port;
			} else {
				ctx->state->post_nat_ip_dst = ctx->state->ip_dst;
				ctx->state->post_nat_dport = ctx->state->dport;
			}
			goto syn_force_policy;
		}
		goto skip_policy;
	}

	/* No conntrack entry, check if we should do NAT */
	nat_lookup_result nat_res = NAT_LOOKUP_ALLOW;

	if (CALI_F_TO_HOST || (CALI_F_FROM_HOST && !skb_seen(ctx->skb) && !ctx->nat_dest /* no sport conflict */)) {
		ctx->nat_dest = calico_nat_lookup_tc(ctx,
						     &ctx->state->ip_src, &ctx->state->ip_dst,
						     ctx->state->ip_proto, ctx->state->dport,
						     !ip_void(ctx->state->tun_ip), &nat_res);
	}

	if (nat_res == NAT_FE_LOOKUP_DROP) {
		CALI_DEBUG("Packet is from an unauthorised source: DROP");
		deny_reason(ctx, CALI_REASON_UNAUTH_SOURCE);
		goto deny;
	}
	if (ctx->nat_dest != NULL) {
		ctx->state->post_nat_ip_dst = ctx->nat_dest->addr;
		ctx->state->post_nat_dport = ctx->nat_dest->port;
	} else if (nat_res == NAT_NO_BACKEND) {
		/* send icmp port unreachable if there is no backend for a service */
#ifdef IPVER6
		ctx->state->icmp_type = ICMPV6_DEST_UNREACH;
		ctx->state->icmp_code = ICMPV6_PORT_UNREACH;
#else
		ctx->state->icmp_type = ICMP_DEST_UNREACH;
		ctx->state->icmp_code = ICMP_PORT_UNREACH;
#endif
		ip_set_void(ctx->state->tun_ip);
		goto icmp_send_reply;
	} else {
		ctx->state->post_nat_ip_dst = ctx->state->ip_dst;
		ctx->state->post_nat_dport = ctx->state->dport;
		if (nat_res == NAT_EXCLUDE) {
			/* We want such packets to go through the host namespace. The main
			 * usecase of this is node-local-dns.
			 */
			ctx->state->flags |= CALI_ST_SKIP_FIB;
			ctx->state->flags |= CALI_ST_NAT_EXCLUDE;
		}
	}

syn_force_policy:
	/* DNAT in state is set correctly now */

	if ((ip_void(ctx->state->tun_ip) && CALI_F_FROM_HEP) && !CALI_F_NAT_IF && !CALI_F_LO) {
		/* On IPv6 builds ICMPv6 is exempt from the HEP RPF check. */
		if (
#ifdef IPVER6
		    ctx->state->ip_proto != IPPROTO_ICMPV6 &&
#endif
		    !hep_rpf_check(ctx)) {
			goto deny;
		}
	}

	if (CALI_F_TO_WEP && (!skb_seen(ctx->skb) ||
			skb_mark_equals(ctx->skb, CALI_SKB_MARK_FROM_NAT_IFACE_OUT, CALI_SKB_MARK_FROM_NAT_IFACE_OUT)) &&
			cali_rt_flags_local_host(cali_rt_lookup_flags(&ctx->state->ip_src))) {
		/* Host to workload traffic always allowed. We discount traffic that was
		 * seen by another program since it must have come in via another interface.
		 */
		CALI_DEBUG("Packet is from the host: ACCEPT");
		goto skip_policy;
	}

	if (CALI_F_FROM_WEP
#ifdef IPVER6
	    && !(ctx->state->ip_proto == IPPROTO_ICMPV6 && ip_link_local(ctx->state->ip_src))
#endif
	   ) {
		struct cali_rt *r = cali_rt_lookup(&ctx->state->ip_src);
		/* Do RPF check since it's our responsibility to police that.
		 * NOTE(review): r is dereferenced below without a NULL check here, so
		 * this relies on wep_rpf_check() failing when r is NULL - confirm.
		 */
		if (!wep_rpf_check(ctx, r)) {
			goto deny;
		}

		// Check whether the workload needs outgoing NAT to this address.
		if (r->flags & CALI_RT_NAT_OUT) {
			struct cali_rt *rt = cali_rt_lookup(&ctx->state->post_nat_ip_dst);
			enum cali_rt_flags flags = CALI_RT_UNKNOWN;
			if (rt) {
				flags = rt->flags;
			}
			if (!(flags & CALI_RT_IN_POOL) && !cali_rt_flags_local_host(flags)) {
				CALI_DEBUG("Source is in NAT-outgoing pool "
					   "but dest is not, need to SNAT.");
				ctx->state->flags |= CALI_ST_NAT_OUTGOING;
			}
		}
		/* If 3rd party CNI is used and dest is outside cluster. See commit fc711b192f for details. */
		if (!(r->flags & CALI_RT_IN_POOL)) {
			CALI_DEBUG("Source " IP_FMT " not in IP pool", debug_ip(ctx->state->ip_src));
			r = cali_rt_lookup(&ctx->state->post_nat_ip_dst);
			if (!r || !(r->flags & (CALI_RT_WORKLOAD | CALI_RT_HOST))) {
				CALI_DEBUG("Outside cluster dest " IP_FMT "", debug_ip(ctx->state->post_nat_ip_dst));
				ctx->state->flags |= CALI_ST_SKIP_FIB;
			}
		}
	}

	/* [SMC] I had to add this revalidation when refactoring the conntrack code to use the context and
	 * adding possible packet pulls in the VXLAN logic. I believe it is spurious but the verifier is
	 * not clever enough to spot that we'd have already bailed out if one of the pulls failed. */
	if (skb_refresh_validate_ptrs(ctx, UDP_SIZE)) {
		deny_reason(ctx, CALI_REASON_SHORT);
		CALI_DEBUG("Too short");
		goto deny;
	}

	ctx->state->pol_rc = CALI_POL_NO_MATCH;
	/* Copy the NAT destination (or a void one) into the state. */
	if (ctx->nat_dest) {
		ctx->state->nat_dest.addr = ctx->nat_dest->addr;
		ctx->state->nat_dest.port = ctx->nat_dest->port;
	} else {
		ip_set_void(ctx->state->nat_dest.addr);
		ctx->state->nat_dest.port = 0;
	}

	// For the case where the packet was sent from a socket on this host, get the
	// sending socket's cookie, so we can reverse a DNAT that the CTLB may have done.
	// This allows us to give the policy program the pre-DNAT destination as well as
	// the post-DNAT destination in all cases.
	__u64 cookie = bpf_get_socket_cookie(ctx->skb);
	if (cookie) {
		/* The cookie is 64 bits wide, so it needs a 64-bit conversion to be
		 * logged in full.
		 */
		CALI_DEBUG("Socket cookie: %llx", cookie);
		struct ct_nats_key ct_nkey = {
			.cookie = cookie,
			.proto = ctx->state->ip_proto,
			.ip = ctx->state->ip_dst,
			.port = host_to_ctx_port(ctx->state->dport),
		};
		// If we didn't find a CTLB NAT entry then use the packet's own IP/port for the
		// pre-DNAT values that's set by tc_state_fill_from_iphdr() and
		// tc_state_fill_from_nexthdr().
		struct sendrec_val *revnat = cali_ct_nats_lookup_elem(&ct_nkey);
		if (revnat) {
			CALI_DEBUG("Got cali_ct_nats entry; flow was NATted by CTLB.");
			ctx->state->pre_nat_ip_dst = revnat->ip;
			ctx->state->pre_nat_dport = ctx_port_to_host(revnat->port);
		}
	}

	if (!forwarding && rt_addr_is_local_host(&ctx->state->ip_src)) {
		CALI_DEBUG("Source IP is local host.");
		if (CALI_F_TO_HEP && is_failsafe_out(ctx->state->ip_proto, ctx->state->post_nat_dport, ctx->state->post_nat_ip_dst)) {
			CALI_DEBUG("Outbound failsafe port: %d. Skip policy.", ctx->state->post_nat_dport);
			counter_inc(ctx, CALI_REASON_ACCEPTED_BY_FAILSAFE);
			goto skip_policy;
		}
		ctx->state->flags |= CALI_ST_SRC_IS_HOST;
	}

	struct cali_rt *dest_rt = NULL;
	// Route lookup is not done for those packets which are nat excluded, where there
	// is a nat hit, but we don't resolve (such as node local DNS cache).
	if (!(ctx->state->flags & CALI_ST_NAT_EXCLUDE)) {
		dest_rt = cali_rt_lookup(&ctx->state->post_nat_ip_dst);
	}

	if (!dest_rt) {
		CALI_DEBUG("No route for post DNAT dest " IP_FMT "", debug_ip(ctx->state->post_nat_ip_dst));
		if (CALI_F_FROM_HEP) {
			/* Disable FIB, let the packet go through the host after it is
			 * policed. It is ingress into the system and we do not know what
			 * exactly is the packet's destination. It may be a local VM or
			 * something similar and we let the host to route it or dump it.
			 *
			 * https://github.com/projectcalico/calico/issues/6450
			 */
			ctx->state->flags |= CALI_ST_SKIP_FIB;
		}
		goto do_policy;
	}

	/* If the dest route is a blackhole route, drop/reject the packet.
	 * This is based on the service loop prevention configuration.
	 * If ServiceLoopPrevention = Drop, route is a blackhole drop route.
	 * If ServiceLoopPrevention = Reject, route is a blackhole reject route.
	 * If ServiceLoopPrevention = Disabled, these routes are not programmed.
	 */
	if (CALI_F_TO_HOST) {
		if (cali_rt_is_blackhole_drop(dest_rt)) {
			CALI_DEBUG("Packet hit a black hole route: DROP");
			deny_reason(ctx, CALI_REASON_BLACK_HOLE);
			goto deny;
		}
		if (cali_rt_is_blackhole_reject(dest_rt)) {
			CALI_DEBUG("Packet hit a black hole route: REJECT");
			deny_reason(ctx, CALI_REASON_BLACK_HOLE);
#ifdef IPVER6
			ctx->state->icmp_type = ICMPV6_DEST_UNREACH;
			ctx->state->icmp_code = ICMPV6_PORT_UNREACH;
#else
			ctx->state->icmp_type = ICMP_DEST_UNREACH;
			ctx->state->icmp_code = ICMP_PORT_UNREACH;
#endif
			goto icmp_send_reply;
		}
	}

	if (cali_rt_flags_local_host(dest_rt->flags)) {
		CALI_DEBUG("Post-NAT dest IP is local host.");
		if (CALI_F_FROM_HEP && is_failsafe_in(ctx->state->ip_proto, ctx->state->post_nat_dport, ctx->state->ip_src)) {
			CALI_DEBUG("Inbound failsafe port: %d. Skip policy.", ctx->state->post_nat_dport);
			counter_inc(ctx, CALI_REASON_ACCEPTED_BY_FAILSAFE);
			goto skip_policy;
		}
		ctx->state->flags |= CALI_ST_DEST_IS_HOST;
	} else if (CALI_F_FROM_HEP && !ctx->nat_dest && !cali_rt_is_local(dest_rt)) {
		/* Disable FIB, let the packet go through the host after it is
		 * policed. It is ingress into the system and we got a packet, which is
		 * not for this host, and it wasn't resolved as a service and it is not
		 * for a local workload either. But we hit a route so it may be some L2
		 * broadcast, we do not quite know. Let the host route it or dump it.
		 *
		 * https://github.com/projectcalico/calico/issues/8918
		 */
		ctx->state->flags |= CALI_ST_SKIP_FIB;
	}

	if (CALI_F_TO_HEP && ctx->nat_dest && !skb_seen(ctx->skb) && !(ctx->state->flags & CALI_ST_HOST_PSNAT)) {
		CALI_DEBUG("Host accesses nodeport backend " IP_FMT ":%d",
			   debug_ip(ctx->state->post_nat_ip_dst), ctx->state->post_nat_dport);
		CALI_DEBUG("Host accesses nodeport state->flags 0x%x", ctx->state->flags);
		if (cali_rt_flags_local_workload(dest_rt->flags)) {
			CALI_DEBUG("NP redir on HEP - skip policy");
			ctx->state->flags |= CALI_ST_CT_NP_LOOP;
			ctx->state->pol_rc = CALI_POL_ALLOW;
			goto skip_policy;
		} else if (cali_rt_flags_remote_workload(dest_rt->flags)) {
			if (CALI_F_LO) {
				CALI_DEBUG("NP redir remote on LO");
				ctx->state->flags |= CALI_ST_CT_NP_LOOP;
			} else if (CALI_F_MAIN && cali_rt_is_tunneled(dest_rt)) {
				CALI_DEBUG("NP redir remote on HEP to tunnel");
				ctx->state->flags |= CALI_ST_CT_NP_LOOP;
			}
			ctx->state->flags |= CALI_ST_CT_NP_REMOTE;
		}
	}

do_policy:
#ifdef IPVER6
	if (ctx->state->ip_proto == IPPROTO_ICMPV6) {
		switch (icmp_hdr(ctx)->icmp6_type) {
		case 130: /* multicast listener query */
		case 131: /* multicast listener report */
		case 132: /* multicast listener done */
		case 133: /* router solicitation */
		case 135: /* neighbor solicitation */
		case 136: /* neighbor advertisement */
			CALI_DEBUG("allow ICMPv6 type %d", icmp_hdr(ctx)->icmp6_type);
			/* We use iptables to allow it only to the host. */
			if (CALI_F_TO_HOST) {
				ctx->state->flags |= CALI_ST_SKIP_FIB;
			}
			goto skip_policy;
		}
	}
#endif

	if (CALI_F_TO_WEP && ctx->skb->mark == CALI_SKB_MARK_MASQ) {
		CALI_DEBUG("MASQ to self - using dest as source for policy.");
		/* The original source is kept in ip_src_masq. */
		ctx->state->ip_src_masq = ctx->state->ip_src;
		ctx->state->ip_src = ctx->state->ip_dst;
	}

	CALI_DEBUG("About to jump to policy program.");
	CALI_JUMP_TO_POLICY(ctx);
	/* Only reached if the jump to the policy program did not happen. */
	if (CALI_F_HEP) {
		CALI_DEBUG("HEP with no policy, allow.");
		goto skip_policy;
	} else {
		/* should not reach here */
		CALI_DEBUG("WEP with no policy, deny.");
		goto deny;
	}

icmp_send_reply:
	CALI_JUMP_TO(ctx, PROG_INDEX_ICMP);
	/* should not reach here */
	goto deny;

skip_policy:
	ctx->state->pol_rc = CALI_POL_ALLOW;
	ctx->state->flags |= CALI_ST_SKIP_POLICY;
	CALI_JUMP_TO(ctx, PROG_INDEX_ALLOWED);
	CALI_DEBUG("jump failed");
	/* should not reach here */
	goto deny;

finalize:
	return;

deny:
	ctx->fwd.res = TC_ACT_SHOT;
}
/* Result codes returned by do_nat().
 *
 * NOTE(review): do_nat() has deny, allow, nat_encap and icmp_too_big exit
 * paths; presumably each maps to the like-named value below - confirm against
 * the return statements at the end of do_nat().
 */
enum do_nat_res {
	NAT_DENY         = 0,
	NAT_ALLOW        = 1,
	NAT_ENCAP_ALLOW  = 2,
	NAT_ICMP_TOO_BIG = 3,
};
static CALI_BPF_INLINE enum do_nat_res do_nat(struct cali_tc_ctx *ctx,
size_t ip_hdr_offset,
size_t l4_csum_off,
bool ct_related,
int ct_rc,
struct ct_create_ctx *ct_ctx_nat,
bool *is_dnat,
__u32 *seen_mark,
bool inner_icmp)
{
bool encap_needed = false;
#ifdef IPVER6
size_t l3_csum_off = 0;
#else
size_t l3_csum_off = ip_hdr_offset + offsetof(struct iphdr, check);
#endif
switch (ct_rc){
case CALI_CT_ESTABLISHED_DNAT:
if (CALI_F_FROM_HEP && !ip_void(STATE->tun_ip) && ct_result_np_node(STATE->ct_result)) {
/* Packet is returning from a NAT tunnel,
* already SNATed, just forward it.
*/
*seen_mark = CALI_SKB_MARK_BYPASS_FWD;
CALI_DEBUG("returned from NAT tunnel");
goto allow;
}
STATE->post_nat_ip_dst = STATE->ct_result.nat_ip;
STATE->post_nat_dport = STATE->ct_result.nat_port;
/* fall through */
case CALI_CT_NEW:
/* We may not do a true DNAT here if we are resolving service source port
* conflict with host->pod w/o service. See calico_tc_host_ct_conflict().
*/
*is_dnat = !ip_equal(STATE->ip_dst, STATE->post_nat_ip_dst) || STATE->dport != STATE->post_nat_dport;
CALI_DEBUG("CT: DNAT to " IP_FMT ":%d",
debug_ip(STATE->post_nat_ip_dst), STATE->post_nat_dport);
encap_needed = dnat_should_encap();
/* We have not created the conntrack yet since we did not know
* if we need encap or not. Must do before MTU check and before
* we jump to do the encap.
*/
if (ct_ctx_nat /* iff CALI_CT_NEW */) {
struct cali_rt * rt;
if (encap_needed) {
/* When we need to encap, we need to find out if the backend is
* local or not. If local, we actually do not need the encap.
*/
rt = cali_rt_lookup(&STATE->post_nat_ip_dst);
if (!rt) {
deny_reason(ctx, CALI_REASON_RT_UNKNOWN);
goto deny;
}
CALI_DEBUG("rt found for " IP_FMT " local %d",
debug_ip(STATE->post_nat_ip_dst), !!cali_rt_is_local(rt));
encap_needed = !cali_rt_is_local(rt);
if (encap_needed) {
if (CALI_F_FROM_HEP && ip_void(STATE->tun_ip)) {
if (CALI_F_DSR) {
ct_ctx_nat->flags |= CALI_CT_FLAG_DSR_FWD |
(STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR);
}
ct_ctx_nat->flags |= CALI_CT_FLAG_NP_FWD;
}
ct_ctx_nat->allow_return = true;
ct_ctx_nat->tun_ip = rt->next_hop;
STATE->ip_dst = rt->next_hop;
} else if (cali_rt_is_workload(rt) &&
!ip_equal(STATE->ip_dst, STATE->post_nat_ip_dst) &&
!CALI_F_NAT_IF) {
/* Packet arrived from a HEP for a workload and we're
* about to NAT it. We can't rely on the kernel's RPF check
* to do the right thing here in the presence of source
* based routing because the kernel would do the RPF check
* based on the post-NAT dest IP and that may give the wrong
* result.
*
* Marking the packet allows us to influence which routing
* rule is used.
*/
ct_ctx_nat->flags |= CALI_CT_FLAG_EXT_LOCAL;
STATE->ct_result.flags |= CALI_CT_FLAG_EXT_LOCAL;
CALI_DEBUG("CT_NEW marked with FLAG_EXT_LOCAL");
}
}
if (CALI_F_FROM_WEP && ip_equal(STATE->ip_src, STATE->post_nat_ip_dst)) {
CALI_DEBUG("New loopback SNAT");
ct_ctx_nat->flags |= CALI_CT_FLAG_SVC_SELF;
STATE->ct_result.flags |= CALI_CT_FLAG_SVC_SELF;
}
ct_ctx_nat->type = CALI_CT_TYPE_NAT_REV;
int err;
if ((err = conntrack_create(ctx, ct_ctx_nat))) {
CALI_DEBUG("Creating NAT conntrack failed with %d", err);
deny_reason(ctx, CALI_REASON_CT_CREATE_FAILED);
goto deny;
}
STATE->ct_result.nat_sip = ct_ctx_nat->src;
STATE->ct_result.nat_sport = ct_ctx_nat->sport;
} else {
if (encap_needed && ct_result_np_node(STATE->ct_result)) {
CALI_DEBUG("CT says encap to node " IP_FMT "", debug_ip(STATE->ct_result.tun_ip));
STATE->ip_dst = STATE->ct_result.tun_ip;
} else {
encap_needed = false;
}
}
if (encap_needed) {
if (!(STATE->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) &&
ip_is_dnf(ip_hdr(ctx)) && vxlan_encap_too_big(ctx)) {
CALI_DEBUG("Request packet with DNF set is too big");
goto icmp_too_big;
}
STATE->ip_src = HOST_IP;
*seen_mark = CALI_SKB_MARK_BYPASS_FWD; /* Do FIB if possible */
CALI_DEBUG("marking CALI_SKB_MARK_BYPASS_FWD");
goto nat_encap;
}
ip_hdr_set_ip(ctx, saddr, STATE->ct_result.nat_sip);
ip_hdr_set_ip(ctx, daddr, STATE->post_nat_ip_dst);
switch (STATE->ip_proto) {
case IPPROTO_TCP:
if (STATE->ct_result.nat_sport) {
CALI_DEBUG("Fixing TCP source port from %d to %d",
bpf_ntohs(tcp_hdr(ctx)->source), STATE->ct_result.nat_sport);
tcp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_sport);
}
tcp_hdr(ctx)->dest = bpf_htons(STATE->post_nat_dport);
break;
case IPPROTO_UDP:
if (STATE->ct_result.nat_sport) {
CALI_DEBUG("Fixing UDP source port from %d to %d",
bpf_ntohs(udp_hdr(ctx)->source), STATE->ct_result.nat_sport);
udp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_sport);
}
udp_hdr(ctx)->dest = bpf_htons(STATE->post_nat_dport);
break;
}
CALI_DEBUG("DNAT L3 csum at %d L4 csum at %d", l3_csum_off, l4_csum_off);
if (l4_csum_off) {
if (skb_nat_l4_csum(ctx, l4_csum_off,
STATE->ip_src,
STATE->ct_result.nat_sip,
STATE->ip_dst,
STATE->post_nat_ip_dst,
bpf_htons(STATE->dport),
bpf_htons(STATE->post_nat_dport),
bpf_htons(STATE->sport),
bpf_htons(STATE->ct_result.nat_sport ? : STATE->sport),
STATE->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0,
inner_icmp)) {
goto deny;
}
}
if (inner_icmp) {
/* updating related icmp inner header. Because it can be anywhere
* and we are not updating in-place, we need to write it back
* before we update the csum.
*/
if (bpf_skb_store_bytes(ctx->skb, ip_hdr_offset, ip_hdr(ctx), IP_SIZE, 0)) {
CALI_DEBUG("Too short for IP write back");
deny_reason(ctx, CALI_REASON_SHORT);
goto deny;
}
if (bpf_skb_store_bytes(ctx->skb, ip_hdr_offset + ctx->ipheader_len, ctx->nh, 8, 0)) {
CALI_DEBUG("Too short for L4 ports write back");
deny_reason(ctx, CALI_REASON_SHORT);
goto deny;
}
}
#ifndef IPVER6
if (!inner_icmp) {
if (bpf_l3_csum_replace(ctx->skb, l3_csum_off, STATE->ip_src,
STATE->ct_result.nat_sip, 4) ||
bpf_l3_csum_replace(ctx->skb, l3_csum_off,
STATE->ip_dst, STATE->post_nat_ip_dst, 4)) {
deny_reason(ctx, CALI_REASON_CSUM_FAIL);
goto deny;
}
}
#endif
/* From now on, the packet has a new source IP */
if (!ip_void(STATE->ct_result.nat_sip)) {
STATE->ip_src = STATE->ct_result.nat_sip;
}
/* Handle returning ICMP related to tunnel
*
* N.B. we assume that we can fit in the MTU. Since it is ICMP
* and even though Linux sends up to min ipv4 MTU, it is
* unlikely that we are anywhere close to the MTU limit. If we
* are, we need to fail anyway.
*/
if (ct_related && STATE->ip_proto == IPPROTO_ICMP
&& !ip_void(STATE->ct_result.tun_ip)
&& (!CALI_F_DSR || (STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR))) {
if (dnat_return_should_encap()) {
CALI_DEBUG("Returning related ICMP from workload to tunnel");
} else if (CALI_F_TO_HEP) {
/* Special case for ICMP error being returned by the host with the
* backing workload into the tunnel back to the original host. It is
* ICMP related and there is a return tunnel path. We need to change
* both the source and destination at once.
*
* XXX the packet was routed to the original client as if it was XXX
* DSR and we might not be on the right iface!!! Should we XXX try
* to reinject it to fix the routing?
*/
CALI_DEBUG("Returning related ICMP from host to tunnel");
}
STATE->ip_src = HOST_IP;
STATE->ip_dst = STATE->ct_result.tun_ip;
goto nat_encap;
}
STATE->dport = STATE->post_nat_dport;
STATE->ip_dst = STATE->post_nat_ip_dst;
goto allow;
case CALI_CT_ESTABLISHED_SNAT:
CALI_DEBUG("CT: SNAT from " IP_FMT ":%d",
debug_ip(STATE->ct_result.nat_ip), STATE->ct_result.nat_port);
if (dnat_return_should_encap() && !ip_void(STATE->ct_result.tun_ip)) {
if (CALI_F_DSR && !(STATE->ct_result.flags & CALI_CT_FLAG_NP_NO_DSR)) {
/* SNAT will be done after routing, when leaving HEP */
CALI_DEBUG("DSR enabled, skipping SNAT + encap");
goto allow;
}
if (!(STATE->ip_proto == IPPROTO_TCP && skb_is_gso(ctx->skb)) &&
ip_is_dnf(ip_hdr(ctx)) && vxlan_encap_too_big(ctx)) {
CALI_DEBUG("Return ICMP mtu is too big");
goto icmp_too_big;
}
}
// Actually do the NAT.
ip_hdr_set_ip(ctx, saddr, STATE->ct_result.nat_ip);
ip_hdr_set_ip(ctx, daddr, STATE->ct_result.nat_sip);
switch (ctx->state->ip_proto) {
case IPPROTO_TCP:
tcp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_port);
if (STATE->ct_result.nat_sport) {
CALI_DEBUG("Fixing TCP dest port from %d to %d",
bpf_ntohs(tcp_hdr(ctx)->dest), STATE->ct_result.nat_sport);
tcp_hdr(ctx)->dest = bpf_htons(STATE->ct_result.nat_sport);
}
break;
case IPPROTO_UDP:
udp_hdr(ctx)->source = bpf_htons(STATE->ct_result.nat_port);
if (STATE->ct_result.nat_sport) {
CALI_DEBUG("Fixing UDP dest port from %d to %d",
bpf_ntohs(tcp_hdr(ctx)->dest), STATE->ct_result.nat_sport);
udp_hdr(ctx)->dest = bpf_htons(STATE->ct_result.nat_sport);
}
break;
}
CALI_DEBUG("SNAT L3 csum at %d L4 csum at %d", l3_csum_off, l4_csum_off);
if (l4_csum_off && skb_nat_l4_csum(ctx, l4_csum_off,
STATE->ip_src, STATE->ct_result.nat_ip,
STATE->ip_dst, STATE->ct_result.nat_sip,
bpf_htons(STATE->dport),
bpf_htons(STATE->ct_result.nat_sport ? : STATE->dport),
bpf_htons(STATE->sport), bpf_htons(STATE->ct_result.nat_port),
STATE->ip_proto == IPPROTO_UDP ? BPF_F_MARK_MANGLED_0 : 0,
inner_icmp)) {
deny_reason(ctx, CALI_REASON_CSUM_FAIL);
goto deny;
}
if (inner_icmp) {
/* updating related icmp inner header. Because it can be anywhere