-
Notifications
You must be signed in to change notification settings - Fork 1
/
draft-ietf-grow-ix-bgp-route-server-operations-05.xml
executable file
·778 lines (735 loc) · 36.3 KB
/
draft-ietf-grow-ix-bgp-route-server-operations-05.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
<?xml version="1.0" encoding="us-ascii"?>
<!DOCTYPE rfc PUBLIC '' "http://xml.resource.org/authoring/rfc2629.dtd"[
<!ENTITY I-D.ietf-idr-ix-bgp-route-server PUBLIC '' "http://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-idr-ix-bgp-route-server.xml">
<!ENTITY RFC1997 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.1997.xml">
<!ENTITY RFC2119 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2622 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2622.xml">
<!ENTITY RFC4271 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4271.xml">
<!ENTITY RFC4360 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4360.xml">
<!ENTITY RFC4456 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4456.xml">
<!ENTITY RFC4893 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4893.xml">
<!ENTITY RFC5291 PUBLIC '' "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5291.xml">
]>
<?xml-stylesheet type='text/xsl'
href="http://greenbytes.de/tech/webdav/rfc2629xslt/rfc2629.xslt" ?>
<?rfc strict="yes" ?>
<?rfc toc="yes"?>
<!-- generate a ToC -->
<?rfc tocdepth="4"?>
<!-- the number of levels of subsections in ToC. default: 3 -->
<!-- control references -->
<?rfc symrefs="yes"?>
<!-- use symbolic references tags, i.e, [RFC2119] instead of [1] -->
<?rfc sortrefs="yes" ?>
<!-- sort the reference entries alphabetically -->
<!-- control vertical white space
(using these PIs as follows is recommended by the RFC Editor) -->
<?rfc compact="yes" ?>
<!-- do not start each main section on a new page -->
<?rfc subcompact="no" ?>
<!-- keep one blank line between list items -->
<!-- end of list of popular I-D processing instructions -->
<rfc category="info"
docName="draft-ietf-grow-ix-bgp-route-server-operations-05"
ipr="trust200902"
obsoletes=""
updates=""
submissionType="IETF"
xml:lang="en">
<!-- category values: std, bcp, info, exp, and historic
ipr values: full3667, noModification3667, noDerivatives3667
you can add the attributes updates="NNNN" and obsoletes="NNNN"
they will automatically be output with "(if approved)" -->
<!-- ***** FRONT MATTER ***** -->
<front>
<title abbrev="IXP BGP Route Server Operations">
Internet Exchange BGP Route Server Operations
</title>
<author initials="N" surname="Hilliard" fullname="Nick Hilliard">
<organization>INEX</organization>
<address>
<postal>
<street>4027 Kingswood Road</street>
<city>Dublin</city>
<code>24</code>
<country>IE</country>
</postal>
<email>[email protected]</email>
</address>
</author>
<author initials="E" surname="Jasinska" fullname="Elisa Jasinska">
<organization>BigWave IT</organization>
<address>
<postal>
<street>ul. Skawinska 27/7</street>
<city>Krakow</city>
<region>MP</region>
<code>31-066</code>
<country>Poland</country>
</postal>
<email>[email protected]</email>
</address>
</author>
<author initials="R" surname="Raszuk" fullname="Robert Raszuk">
<organization>Mirantis Inc.</organization>
<address>
<postal>
<street>615 National Ave. #100</street>
<city>Mt View</city>
<region>CA</region>
<code>94043</code>
<country>USA</country>
</postal>
<email>[email protected]</email>
</address>
</author>
<author initials="N" surname="Bakker" fullname="Niels Bakker">
<organization>Akamai Technologies B.V.</organization>
<address>
<postal>
<street>Kingsfordweg 151</street>
<city>Amsterdam</city>
<code>1043 GR</code>
<country>NL</country>
</postal>
<email>[email protected]</email>
</address>
</author>
<date month="June" year="2015" />
<area>Routing</area>
<workgroup>GROW Working Group</workgroup>
<keyword>I-D</keyword>
<keyword>Internet-Draft</keyword>
<keyword>GROW</keyword>
<abstract>
<t>
The popularity of Internet exchange points (IXPs) brings new
challenges to interconnecting networks. While bilateral eBGP sessions
between exchange participants were historically the most common means
of exchanging reachability information over an IXP, the overhead
associated with this interconnection method causes serious operational
and administrative scaling problems for IXP participants.
</t>
<t>
Multilateral interconnection using Internet route servers can
dramatically reduce the administrative and operational overhead
associated with connecting to IXPs; in some cases, route servers
are used by IXP participants as their preferred means of exchanging
routing information.
</t>
<t>
This document describes operational considerations for multilateral
interconnections at IXPs.
</t>
</abstract>
</front>
<middle>
<section title="Introduction">
<t>
Internet exchange points (IXPs) provide IP data interconnection
facilities for their participants, using data link layer protocols
such as Ethernet. The Border Gateway Protocol (BGP) <xref
target="RFC4271" /> is normally used to facilitate exchange of
network reachability information over these media.
</t>
<t>
As bilateral interconnection between IXP participants requires
operational and administrative overhead, BGP route servers <xref
target="I-D.ietf-idr-ix-bgp-route-server" /> are often deployed by IXP
operators to provide a simple and convenient means of interconnecting
IXP participants with each other. A route server redistributes
BGP routes received from its BGP clients to other clients according to a
pre-specified policy, and it can be viewed as similar to an eBGP
equivalent of an iBGP <xref target="RFC4456" /> route reflector.
</t>
<t>
Route servers at IXPs require careful management and it is important
for route server operators to thoroughly understand both how they work
and what their limitations are. In this document, we discuss several
issues of operational relevance to route server operators and provide
recommendations to help route server operators provision a reliable
interconnection service.
</t>
<section title="Notational Conventions">
<t>
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and
"OPTIONAL" in this document are to be interpreted as described in
<xref target="RFC2119" />.
</t>
<t>
The phrase "BGP route" in this document should be interpreted as
the term "Route" described in <xref target="RFC4271" />.
</t>
</section>
</section>
<section title="Bilateral BGP Sessions">
<t>
Bilateral interconnection is a method of interconnecting routers using
individual BGP sessions between each pair of participant routers on an IXP, in
order to exchange reachability information. If an IXP participant
wishes to implement an open interconnection policy - i.e. a policy of
interconnecting with as many other IXP participants as possible - it
is necessary for the participant to liaise with each of their intended
interconnection partners. Interconnection can then be implemented
bilaterally by configuring a BGP session on both participants' routers
to exchange network reachability information. If each exchange
participant interconnects with each other participant, a full mesh of
BGP sessions is needed, as shown in <xref target="ixp_interconnection"
/>.
</t>
<figure title="Full-Mesh Interconnection at an IXP" anchor="ixp_interconnection">
<preamble></preamble>
<artwork align="center">
___ ___
/ \ / \
..| AS1 |..| AS2 |..
: \___/____\___/ :
: | \ / | :
: | \ / | :
: IXP | \/ | :
: | /\ | :
: | / \ | :
: _|_/____\_|_ :
: / \ / \ :
..| AS3 |..| AS4 |..
\___/ \___/
</artwork>
<postamble></postamble>
</figure>
<t>
<xref target="ixp_interconnection" /> depicts an IXP platform with
four connected routers, administered by four separate exchange
participants, each of them with a locally unique autonomous system
number: AS1, AS2, AS3 and AS4. The lines between the routers depict
BGP sessions; the dotted edge represents the IXP border. Each of
these four participants wishes to exchange traffic with all other
participants; this is accomplished by configuring a full mesh of BGP
sessions on each router connected to the exchange, resulting in 6
BGP sessions across the IXP fabric.
</t>
<t>
The number of BGP sessions at an exchange has an upper bound of
n*(n-1)/2, where n is the number of routers at the exchange. As many
exchanges have large numbers of participating networks, the amount of
administrative and operational overhead required to implement an open
interconnection scales quadratically. New participants
to an IXP require significant initial resourcing in order to gain
value from their IXP connection, while existing exchange participants
need to commit ongoing resources in order to benefit from
interconnecting with these new participants.
</t>
</section>
<section title="Multilateral Interconnection">
<t>
Multilateral interconnection is implemented using a route server
configured to distribute BGP routes among client routers. The route
server preserves the BGP NEXT_HOP attribute from all received BGP
routes and passes them with unchanged NEXT_HOP to its route server
clients according to its configured routing policy, as described in
<xref target="I-D.ietf-idr-ix-bgp-route-server" />. Using this
method of exchanging BGP routes, an IXP participant router can
receive an aggregated list of BGP routes from all other route server
clients using a single BGP session to the route server instead of
depending on BGP sessions with each other router at the exchange.
This reduces the overall number of BGP sessions at an Internet
exchange from n*(n-1)/2 to n, where n is the number of routers at
the exchange.
</t>
<t>
Although a route server uses BGP to exchange reachability information
with each of its clients, it does not forward traffic itself and is
therefore not a router.
</t>
<t>
In practical terms, this allows dense interconnection between IXP
participants with low administrative overhead and significantly
simpler and smaller router configurations. In particular, new IXP
participants benefit from immediate and extensive interconnection,
while existing route server participants receive reachability
information from these new participants without necessarily having to
modify their configurations.
</t>
<figure title="IXP-based Interconnection with Route Server" anchor="rs_interconnection">
<preamble></preamble>
<artwork align="center">
___ ___
/ \ / \
..| AS1 |..| AS2 |..
: \___/ \___/ :
: \ / :
: \ / :
: \__/ :
: IXP / \ :
: | RS | :
: \____/ :
: / \ :
: / \ :
: __/ \__ :
: / \ / \ :
..| AS3 |..| AS4 |..
\___/ \___/
</artwork>
<postamble></postamble>
</figure>
<t>
As illustrated in <xref target="rs_interconnection" />, each router on
the IXP fabric requires only a single BGP session to the route server,
from which it can receive reachability information for all other
routers on the IXP which also connect to the route server.
</t>
</section>
<section title="Operational Considerations for Route Server Installations" anchor="ops_considerations">
<section title="Path Hiding" anchor="path_hiding">
<t>
"Path hiding" is a term used in <xref
target="I-D.ietf-idr-ix-bgp-route-server" /> to describe the process
whereby a route server may mask individual paths by applying
conflicting routing policies to its Loc-RIB. When this happens,
route server clients receive incomplete information from the route
server about network reachability.
</t>
<t>
There are several approaches which may be used to mitigate against
the effect of path hiding; these are described in <xref
target="I-D.ietf-idr-ix-bgp-route-server" />. However, the only
method which does not require explicit support from the route server
client is for the route server itself to maintain an individual
Loc-RIB for each client which is the subject of conflicting routing
policies.
</t>
</section>
<section title="Route Server Scaling" anchor="ops_scaling">
<t>
While deployment of multiple Loc-RIBs on the route server presents a
simple way to avoid the path hiding problem noted in <xref
target="path_hiding" />, this approach requires significantly more
computing resources on the route server than where a single Loc-RIB
is deployed for all clients. As the <xref target="RFC4271" /> BGP
decision process must be applied to all Loc-RIBs deployed on the
route server, both CPU and memory requirements on the host computer
scale approximately according to O(P * N), where P is the total
number of unique paths received by the route server and N is the
number of route server clients which require a unique Loc-RIB. As
this is a super-linear scaling relationship, large route servers may
derive benefit from deploying per-client Loc-RIBs only where they
are required.
</t>
<t>
Regardless of whether any Loc-RIB optimization technique is
implemented, the route server's theoretical upper-bound network
bandwidth requirements will scale according to O(P_tot * N), where
P_tot is the total number of unique paths received by the route
server and N is the total number of route server clients. In the
case where P_avg (the arithmetic mean number of unique paths
received per route server client) remains roughly constant even as
the number of connected clients increases, the total number of
prefixes will equal the average number of prefixes multiplied by
the number of clients. Symbolically, this can be written as P_tot
= P_avg * N. If we assume that in the worst case, each prefix is
associated with a different set of BGP path attributes, so must be
transmitted individually, the network bandwidth scaling function
can be rewritten as O((P_avg * N) * N) or O(N^2). This quadratic
upper bound on the network traffic requirements indicates that the
route server model may not scale well for larger numbers of
clients.
</t>
<t>
In practice, most prefixes will be associated with a limited
number of BGP path attribute sets, allowing more efficient
transmission of BGP routes from the route server than the
theoretical analysis suggests. In the analysis above, P_tot will
increase monotonically according to the number of clients, but
will have an upper limit of the size of the full default-free
routing table of the network in which the IXP is located.
Observations from production route servers have shown that most
route server clients generally avoid using custom routing policies
and consequently the route server may not need to deploy
per-client Loc-RIBs. These practical bounds reduce the
theoretical worst-case scaling scenario to the point where
route-server deployments are manageable even on larger IXPs.
</t>
<section title="Tackling Scaling Issues">
<t>
The problem of scaling route servers still presents serious
practical challenges and requires careful attention. Scaling
analysis indicates problems in three key areas: route processor
CPU overhead associated with BGP decision process calculations,
the memory requirements for handling many different BGP path
entries, and the network traffic bandwidth required to
distribute these BGP routes from the route server to each route
server client.
</t>
<section title="View Merging and Decomposition">
<t>
View merging and decomposition, outlined in <xref
target="RS-ARCH" />, describes a method of optimising memory
and CPU requirements where multiple route server clients are
subject to exactly the same routing policies. In this
situation, multiple Loc-RIB views can be merged into a single
view.
</t>
<t>
There are several variations of this approach. If the route
server operator has prior knowledge of interconnection
relationships between route server clients, then the operator
may configure separate Loc-RIBs only for route server clients
with unique routing policies. As this approach
requires prior knowledge of interconnection relationships, the
route server operator must depend on each client sharing their
interconnection policies, either in an internal provisioning
database controlled by the operator, or else in an external
data store such as an Internet Routing Registry Database.
</t>
<t>
Conversely, the route server implementation itself may implement
internal view decomposition by creating virtual Loc-RIBs based
on a single in-memory master Loc-RIB, with delta differences for
each prefix subject to different routing policies. This allows a
more fine-grained and flexible approach to the problem of Loc-RIB
scaling, at the expense of requiring a more complex in-memory
Loc-RIB structure.
</t>
<t>
Whatever method of view merging and decomposition is chosen on
a route server, pathological edge cases can be created whereby
they will scale no better than fully non-optimised per-client
Loc-RIBs. However, as most route server clients connect to a
route server for the purposes of reducing overhead, rather
than implementing complex per-client routing policies, edge
cases tend not to arise in practice.
</t>
</section>
<section title="Destination Splitting">
<t>
Destination splitting, also described in <xref
target="RS-ARCH" />, describes a method for route server
clients to connect to multiple route servers and to send
non-overlapping sets of prefixes to each route server. As
each route server computes the best path for its own set of
prefixes, the quadratic scaling requirement operates on
multiple smaller sets of prefixes. This reduces the overall
computational and memory requirements for managing multiple
Loc-RIBs and performing the best-path calculation on each.
</t>
<t>
In practice, the route server operator would need all route
server clients to send a full set of BGP routes to each route
server. The route server operator could then selectively
filter these prefixes for each route server by using either
BGP Outbound Route Filtering <xref target="RFC5291" /> or else
inbound prefix filters configured on client BGP sessions.
</t>
</section>
<section title="NEXT_HOP Resolution">
<t>
As route servers are usually deployed at IXPs where all
connected routers are on the same layer 2 broadcast domain,
recursive resolution of the NEXT_HOP attribute is generally
not required, and can be replaced by a simple check to ensure
that the NEXT_HOP value for each received BGP route is a
network address on the IXP LAN's IP address range.
</t>
</section>
</section>
</section>
<section title="Prefix Leakage Mitigation" anchor="ops_leakage_mitigation">
<t>
Prefix leakage occurs when a BGP client unintentionally distributes
BGP routes to one or more neighboring BGP routers. Prefix
leakage of this form to a route server can cause serious
connectivity problems at an IXP if each route server client is
configured to accept all BGP routes from the route
server. It is therefore RECOMMENDED when deploying route servers
that, due to the potential for collateral damage caused by BGP route
leakage, route server operators deploy prefix leakage mitigation
measures in order to prevent unintentional prefix announcements or
else limit the scale of any such leak. Although not foolproof,
per-client inbound prefix limits can restrict the damage caused by
prefix leakage in many cases. Per-client inbound prefix filtering on
the route server is a more deterministic and usually more reliable
means of preventing prefix leakage, but requires more administrative
resources to maintain properly.
</t>
<t>
If a route server operator implements per-client inbound prefix
filtering, then it is RECOMMENDED that the operator also builds in
mechanisms to automatically compare the Adj-RIB-In received from
each client with the inbound prefix lists configured for those
clients. Naturally, it is the responsibility of the route server
client to ensure that their stated prefix list is compatible with
what they announce to an IXP route server. However, many network
operators do not carefully manage their published routing policies
and it is not uncommon to see significant variation between the
two sets of prefixes. Route server operator visibility into this
discrepancy can provide significant advantages to both operator
and client.
</t>
</section>
<section title="Route Server Redundancy" anchor="ops_use_two_of_em">
<t>
As the purpose of an IXP route server implementation is to provide
a reliable reachability brokerage service, it is RECOMMENDED that
exchange operators who implement route server systems provision
multiple route servers on each shared Layer-2 domain. There is no
requirement to use the same BGP implementation or operating system
for each route server on the IXP fabric; however, it is
RECOMMENDED that where an operator provisions more than a single
server on the same shared Layer-2 domain, each route server
implementation be configured equivalently and in such a manner
that the path reachability information from each system is
identical.
</t>
</section>
<section title="AS_PATH Consistency Check">
<t>
<xref target="RFC4271" /> requires that every BGP speaker which
advertises a BGP route to another external BGP speaker prepends its
own AS number as the last element of the AS_PATH sequence.
Therefore the leftmost AS in an AS_PATH attribute should be equal
to the autonomous system number of the BGP speaker which sent the
BGP route.
</t>
<t>
As <xref target="I-D.ietf-idr-ix-bgp-route-server" /> suggests
that route servers should not modify the AS_PATH attribute, a
consistency check on the AS_PATH of a BGP route received by a route
server client would normally fail. It is therefore RECOMMENDED
that route server clients disable the AS_PATH consistency check
towards the route server.
</t>
</section>
<section title="Export Routing Policies">
<t>
Policy filtering is commonly implemented on route servers to
provide prefix distribution control mechanisms for route server
clients. A route server "export" policy is a policy which affects
prefixes sent from the route server to a route server client.
Several different strategies are commonly used for implementing
route server export policies.
</t>
<section title="BGP Communities">
<t>
Prefixes sent to the route server are tagged with specific
standard <xref target="RFC1997" /> or extended <xref
target="RFC4360" /> BGP community attributes, based on
pre-defined values agreed between the operator and all clients.
Based on these community tags, BGP routes may be propagated to
all other clients, a subset of clients, or none. This mechanism
allows route server clients to instruct the route server to
implement per-client export routing policies.
</t>
<t>
As both standard and extended BGP community values are currently
restricted to 6 octets or fewer, it is not possible for both the
global and local administrator fields in the BGP community to
fit a 4-octet autonomous system number. Bearing this in mind,
the route server operator SHOULD take care to ensure that the
predefined BGP community values mechanism used on their route
server is compatible with <xref target="RFC4893"/> 4-octet ASNs.
</t>
</section>
<section title="Internet Routing Registries">
<t>
Internet Routing Registry databases (IRRDBs) may be used by route
server operators to construct per-client routing
policies. <xref target="RFC2622" /> Routing Policy Specification
Language (RPSL) provides a comprehensive grammar for describing
interconnection relationships, and several toolsets exist which
can be used to translate RPSL policy description into route server
configurations.
</t>
</section>
<section title="Client-accessible Databases">
<t>
Should the route server operator not wish to use either BGP
community tags or the public IRRDBs for implementing client
export policies, they may implement their own routing policy
database system for managing their clients' requirements. A
database of this form SHOULD allow a route server client
operator to update their routing policy and provide a mechanism
for allowing the client to specify whether they wish to exchange
all their prefixes with any other route server client.
Optionally, the implementation may allow a client to specify
unique routing policies for individual prefixes over which they
have routing policy control.
</t>
</section>
</section>
<section title="Layer 2 Reachability Problems">
<t>
Layer 2 reachability problems on an IXP can cause serious
operational problems for IXP participants which depend on route
servers for interconnection. Ethernet switch forwarding bugs have
occasionally been observed to cause non-transitive reachability.
For example, given a route server and two IXP participants, A and B,
if the two participants can reach the route server but cannot reach
each other, then traffic between the participants may be dropped
until such time as the layer 2 forwarding problem is resolved. This
situation does not tend to occur in bilateral interconnection
arrangements, as the routing control path between the two hosts is
usually (but not always, due to IXP inter-switch connectivity load
balancing algorithms) the same as the data path between them.
</t>
<t>
Problems of this form can be partially mitigated by using <xref
target="RFC5881" /> bidirectional forwarding detection. However,
as this is a bilateral protocol configured between routers, and as
there is currently no protocol to automatically configure BFD
sessions between route server clients, BFD does not currently
provide an optimal means of handling the problem. Even if
automatic BFD session configuration were possible, practical
problems would remain. If two IXP route server clients were
configured to run BFD between each other and the protocol detected
a non-transitive loss of reachability between them, each of those
routers would internally mark the other's prefixes as unreachable
via the BGP path announced by the route server. As the route
server only propagates a single best path to each client, this
could cause either sub-optimal routing or complete connectivity
loss if there were no alternative paths learned from other BGP
sessions.
</t>
</section>
<section title="BGP NEXT_HOP Hijacking" anchor="nh_hijack">
<t>
Section 5.1.3(2) of <xref target="RFC4271" /> allows eBGP speakers
to change the NEXT_HOP address of a received BGP route to be a different
internet address on the same subnet. This is the mechanism which
allows route servers to operate on a shared layer 2 IXP network.
However, the mechanism can be abused by route server clients to
redirect traffic for their prefixes to other IXP participant
routers.
</t>
<figure title="BGP NEXT_HOP Hijacking using a Route Server" anchor="pic_nh_hijack">
<preamble></preamble>
<artwork align="center">
____
/ \
| AS99 |
\____/
/ \
/ \
__/ \__
/ \ / \
..| AS1 |..| AS2 |..
: \___/ \___/ :
: \ / :
: \ / :
: \__/ :
: IXP / \ :
: | RS | :
: \____/ :
: :
....................
</artwork>
<postamble></postamble>
</figure>
<t>
For example, in <xref target="pic_nh_hijack" />, if AS1 and AS2
both announce BGP routes for AS99 to the route server, AS1 could set
the NEXT_HOP address for AS99's routes to be the address of
AS2's router, thereby diverting traffic for AS99 via AS2. This
may override the routing policies of AS99 and AS2.
</t>
<t>
Worse still, if the route server operator does not use inbound
prefix filtering, AS1 could announce any arbitrary prefix to the
route server with a NEXT_HOP address of any other IXP
participant. This could be used as a denial of service mechanism
against either the users of the address space being announced by
illicitly diverting their traffic, or the other IXP participant by
overloading their network with traffic which would not normally be
sent there.
</t>
<t>
This problem is not specific to route servers and it can also be
implemented using bilateral BGP sessions. However, the
potential damage is amplified by route servers because a single
BGP session can be used to affect many networks simultaneously.
</t>
<t>
Because route server clients cannot easily implement next-hop
policy checks against route server BGP sessions, route server
operators SHOULD check that the BGP NEXT_HOP attribute for BGP
routes received from a route server client matches the interface
address of the client. If the route server receives a BGP route
where these addresses are different and where the announcing route
server client is in a different autonomous system to the route
server client which uses the next hop address, the BGP route
SHOULD be dropped. Permitting next-hop rewriting for the same
autonomous system allows an organisation with multiple connections
into an IXP configured with different IP addresses to direct
traffic off the IXP infrastructure through any of their
connections for traffic engineering or other purposes.
</t>
</section>
</section>
<section title="Security Considerations">
<t>
On route server installations which do not employ path hiding
mitigation techniques, the path hiding problem outlined in <xref
target="path_hiding" /> could be used by an IXP participant to
prevent the route server from sending any BGP routes for a
particular prefix to other route server clients, even if there were
a valid path to that destination via another route server client.
</t>
<t>
If the route server operator does not implement prefix leakage
mitigation as described in <xref target="ops_leakage_mitigation" />,
it is trivial for route server clients to implement denial of
service attacks against arbitrary Internet networks by leaking BGP
routes to a route server.
</t>
<t>
Route server installations SHOULD be secured against BGP NEXT_HOP
hijacking, as described in <xref target="nh_hijack" />.
</t>
</section>
<section title="IANA Considerations">
<t>
There are no IANA considerations.
</t>
</section>
<section title="Acknowledgments">
<t>
The authors would like to thank Chris Hall, Ryan Bickhart, Steven
Bakker and Eduardo Ascen&#231;o Reis for their valuable input.
</t>
</section>
</middle>
<back>
<references title="Normative References">
<?rfc include="reference.I-D.ietf-idr-ix-bgp-route-server"?>
<?rfc include="reference.RFC.2119"?> <!-- key words -->
</references>
<references title="Informative References">
<?rfc include="reference.RFC.1997"?> <!-- BGP Communities -->
<?rfc include="reference.RFC.2622"?> <!-- RPSL -->
<?rfc include="reference.RFC.4271"?> <!-- BGP-4 -->
<?rfc include="reference.RFC.4360"?> <!-- Extended Communities -->
<?rfc include="reference.RFC.4456"?> <!-- Route Reflector IBGP -->
<?rfc include="reference.RFC.4893"?> <!-- BGP Support for Four-octet AS Number Space -->
<?rfc include="reference.RFC.5291"?> <!-- Outbound Route Filtering Capability for BGP-4 -->
<?rfc include="reference.RFC.5881"?> <!-- BFD -->
<reference anchor="RS-ARCH"
target="http://www.cs.usc.edu/assets/003/83191.pdf">
<front>
<title>A Route Server Architecture for Inter-Domain Routing</title>
<author initials="R" surname="Govindan" fullname="Ramesh Govindan">
<organization>The University of Southern California</organization>
</author>
<author initials="C" surname="Alaettinoglu" fullname="Cengiz Alaettinoglu">
<organization>The University of Southern California</organization>
</author>
<author initials="K" surname="Varadhan" fullname="Kannan Varadhan">
<organization>The University of Southern California</organization>
</author>
<author initials="D" surname="Estrin" fullname="Deborah Estrin">
<organization>The University of Southern California</organization>
</author>
<date year="1995" />
</front>
</reference>
</references>
</back>
</rfc>