Skip to content

Commit

Permalink
tcp/dccp: install syn_recv requests into ehash table
Browse files Browse the repository at this point in the history
In this patch, we insert request sockets into TCP/DCCP
regular ehash table (where ESTABLISHED and TIMEWAIT sockets
are) instead of using the per listener hash table.

ACK packets find SYN_RECV pseudo sockets without having
to find and lock the listener.

In nominal conditions, this halves pressure on listener lock.

Note that this will allow for SO_REUSEPORT refinements,
so that we can select a listener using cpu/numa affinities instead
of the prior 'consistent hash', since only SYN packets will
apply this selection logic.

We will shrink listen_sock in the following patch to ease
code review.

Signed-off-by: Eric Dumazet <[email protected]>
Cc: Ying Cai <[email protected]>
Cc: Willem de Bruijn <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
Eric Dumazet authored and davem330 committed Oct 3, 2015
1 parent 2feda34 commit 079096f
Show file tree
Hide file tree
Showing 15 changed files with 160 additions and 501 deletions.
4 changes: 0 additions & 4 deletions include/net/inet_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,

struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);

struct request_sock *inet_csk_search_req(struct sock *sk,
const __be16 rport,
const __be32 raddr,
const __be32 laddr);
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax);
int inet_csk_get_port(struct sock *sk, unsigned short snum);
Expand Down
1 change: 1 addition & 0 deletions include/net/inet_hashtables.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ void inet_put_port(struct sock *sk);

void inet_hashinfo_init(struct inet_hashinfo *h);

int inet_ehash_insert(struct sock *sk, struct sock *osk);
void __inet_hash_nolisten(struct sock *sk, struct sock *osk);
void __inet_hash(struct sock *sk, struct sock *osk);
void inet_hash(struct sock *sk);
Expand Down
4 changes: 0 additions & 4 deletions include/net/request_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,4 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log;
}

void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
unsigned long timeout);

#endif /* _REQUEST_SOCK_H */
3 changes: 0 additions & 3 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
/* /proc */
enum tcp_seq_states {
TCP_SEQ_STATE_LISTENING,
TCP_SEQ_STATE_OPENREQ,
TCP_SEQ_STATE_ESTABLISHED,
};

Expand Down Expand Up @@ -1717,8 +1716,6 @@ struct tcp_request_sock_ops {
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req,
u16 queue_mapping, struct tcp_fastopen_cookie *foc);
void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
const unsigned long timeout);
};

#ifdef CONFIG_SYN_COOKIES
Expand Down
28 changes: 1 addition & 27 deletions net/core/request_sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,35 +99,9 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(

void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);

if (reqsk_queue_len(queue) != 0) {
unsigned int i;

for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req;

spin_lock_bh(&queue->syn_wait_lock);
while ((req = lopt->syn_table[i]) != NULL) {
lopt->syn_table[i] = req->dl_next;
/* Because of following del_timer_sync(),
* we must release the spinlock here
* or risk a dead lock.
*/
spin_unlock_bh(&queue->syn_wait_lock);
atomic_dec(&queue->qlen);
if (del_timer_sync(&req->rsk_timer))
reqsk_put(req);
reqsk_put(req);
spin_lock_bh(&queue->syn_wait_lock);
}
spin_unlock_bh(&queue->syn_wait_lock);
}
}

if (WARN_ON(reqsk_queue_len(queue) != 0))
pr_err("qlen %u\n", reqsk_queue_len(queue));
/* cleaning is done by req timers */
kvfree(lopt);
}

Expand Down
64 changes: 21 additions & 43 deletions net/dccp/ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -444,36 +444,6 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
}
EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);

static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport,
iph->saddr, iph->daddr);
if (req) {
nsk = dccp_check_req(sk, skb, req);
if (!nsk)
reqsk_put(req);
return nsk;
}
nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
iph->saddr, dh->dccph_sport,
iph->daddr, dh->dccph_dport,
inet_iif(skb));
if (nsk != NULL) {
if (nsk->sk_state != DCCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}

return sk;
}

static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
Expand Down Expand Up @@ -705,26 +675,13 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in
* dccp_rcv_state_process
*/
if (sk->sk_state == DCCP_LISTEN) {
struct sock *nsk = dccp_v4_hnd_req(sk, skb);

if (nsk == NULL)
goto discard;

if (nsk != sk) {
if (dccp_child_process(sk, nsk, skb))
goto reset;
return 0;
}
}

if (dccp_rcv_state_process(sk, skb, dh, skb->len))
goto reset;
return 0;

reset:
dccp_v4_ctl_send_reset(sk, skb);
discard:
kfree_skb(skb);
return 0;
}
Expand Down Expand Up @@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb)
goto no_dccp_socket;
}

if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;

sk = req->rsk_listener;
if (sk->sk_state == DCCP_LISTEN)
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v4_ctl_send_reset(sk, skb);
goto discard_it;
} else {
return 0;
}
}
/*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted
Expand Down
72 changes: 22 additions & 50 deletions net/dccp/ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = {
.syn_ack_timeout = dccp_syn_ack_timeout,
};

static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct request_sock *req;
struct sock *nsk;

req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr,
&iph->daddr, inet6_iif(skb));
if (req) {
nsk = dccp_check_req(sk, skb, req);
if (!nsk)
reqsk_put(req);
return nsk;
}
nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
&iph->saddr, dh->dccph_sport,
&iph->daddr, ntohs(dh->dccph_dport),
inet6_iif(skb));
if (nsk != NULL) {
if (nsk->sk_state != DCCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}

return sk;
}

static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct request_sock *req;
Expand Down Expand Up @@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (dccp_v6_send_response(sk, req))
goto drop_and_free;

inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
return 0;

drop_and_free:
Expand Down Expand Up @@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in
* dccp_rcv_state_process
*/
if (sk->sk_state == DCCP_LISTEN) {
struct sock *nsk = dccp_v6_hnd_req(sk, skb);

if (nsk == NULL)
goto discard;
/*
* Queue it on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
if (nsk != sk) {
if (dccp_child_process(sk, nsk, skb))
goto reset;
if (opt_skb != NULL)
__kfree_skb(opt_skb);
return 0;
}
}

if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
goto reset;
Expand Down Expand Up @@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb)
goto no_dccp_socket;
}

if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;

sk = req->rsk_listener;
if (sk->sk_state == DCCP_LISTEN)
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v6_ctl_send_reset(sk, skb);
goto discard_it;
} else {
return 0;
}
}
/*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted
Expand Down
Loading

0 comments on commit 079096f

Please sign in to comment.