diff options
| -rw-r--r-- | include/linux/bpf.h | 50 | ||||
| -rw-r--r-- | include/linux/filter.h | 3 | ||||
| -rw-r--r-- | kernel/bpf/cgroup.c | 25 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 12 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 16 | ||||
| -rw-r--r-- | net/ipv4/ip_output.c | 34 | ||||
| -rw-r--r-- | net/ipv6/ip6_output.c | 26 | ||||
| -rwxr-xr-x | samples/bpf/do_hbm_test.sh | 10 | ||||
| -rw-r--r-- | samples/bpf/hbm.c | 51 | ||||
| -rw-r--r-- | samples/bpf/hbm.h | 9 | ||||
| -rw-r--r-- | samples/bpf/hbm_kern.h | 66 | ||||
| -rw-r--r-- | samples/bpf/hbm_out_kern.c | 48 |
12 files changed, 299 insertions, 51 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ff3e00ff84d2..2cc58fc0f413 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -552,6 +552,56 @@ _out: \ _ret; \ }) +/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs + * so BPF programs can request cwr for TCP packets. + * + * Current cgroup skb programs can only return 0 or 1 (0 to drop the + * packet. This macro changes the behavior so the low order bit + * indicates whether the packet should be dropped (0) or not (1) + * and the next bit is a congestion notification bit. This could be + * used by TCP to call tcp_enter_cwr() + * + * Hence, new allowed return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * This macro then converts it to one of the NET_XMIT or an error + * code that is then interpreted as drop packet (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -EPERM skb should be dropped + */ +#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 ret; \ + u32 _ret = 1; \ + u32 _cn = 0; \ + preempt_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + ret = func(_prog, ctx); \ + _ret &= (ret & 1); \ + _cn |= (ret & 2); \ + _item++; \ + } \ + rcu_read_unlock(); \ + preempt_enable(); \ + if (_ret) \ + _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ + else \ + _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret; \ + }) + #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ __BPF_PROG_RUN_ARRAY(array, ctx, func, false) diff --git a/include/linux/filter.h b/include/linux/filter.h index ba8b65270e0d..43b45d6db36d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -526,7 +526,8 @@ struct bpf_prog { blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1; /* callchain buffer allocated? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ff594eb86fd7..1b65ab0df457 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned. */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3d546b6f4646..1539774d78c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? + -EINVAL : 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2778417e6e0c..5c2cb5bd84ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: @@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bfd0ca554977..1217a53381c2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, return ret; } -static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -315,18 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { + switch (ret) { + case NET_XMIT_SUCCESS: + return dev_loopback_xmit(net, sk, skb); + case NET_XMIT_CN: + return dev_loopback_xmit(net, sk, skb) ? : ret; + default: kfree_skb(skb); return ret; } - - return dev_loopback_xmit(net, sk, skb); } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index adef2236abe2..a75bc21d8c88 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -128,16 +128,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -154,6 +146,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index 56c8b4115c95..e48b047d4646 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -13,10 +13,10 @@ Usage() { echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" echo "loads. The output is the goodput in Mbps (unless -D was used)." echo "" - echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]" - echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]" + echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]" + echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E]" echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]" - echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]" + echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]" echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]" echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]" echo " Where:" @@ -33,6 +33,7 @@ Usage() { echo " -f or --flows number of concurrent flows (default=1)" echo " -i or --id cgroup id (an integer, default is 1)" echo " -N use netperf instead of iperf3" + echo " --no_cn Do not return CN notifications" echo " -l do not limit flows using loopback" echo " -h Help" echo " -p or --port iperf3 port (default is 5201)" @@ -115,6 +116,9 @@ processArgs () { -c=*|--cc=*) cc="${i#*=}" ;; + --no_cn) + flags="$flags --no_cn" + ;; --debug) flags="$flags -d" debug_flag=1 diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index a79828ab273f..480b7ad6a1f2 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -16,6 +16,7 @@ * -l Also limit flows doing loopback * -n <#> To create cgroup \"/hbm#\" and attach prog * Default is /hbm1 + * --no_cn Do not return cn notifications * -r <rate> Rate limit in Mbps * -s Get HBM stats (marked, dropped, etc.) * -t <time> Exit after specified seconds (default is 0) @@ -42,6 +43,7 @@ #include <linux/bpf.h> #include <bpf/bpf.h> +#include <getopt.h> #include "bpf_load.h" #include "bpf_rlimit.h" @@ -59,6 +61,7 @@ bool stats_flag; bool loopback_flag; bool debugFlag; bool work_conserving_flag; +bool no_cn_flag; static void Usage(void); static void read_trace_pipe2(void); @@ -185,6 +188,7 @@ static int run_bpf_prog(char *prog, int cg_id) qstats.rate = rate; qstats.stats = stats_flag ? 1 : 0; qstats.loopback = loopback_flag ? 1 : 0; + qstats.no_cn = no_cn_flag ? 1 : 0; if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { printf("ERROR: Could not update map element\n"); goto err; @@ -312,6 +316,14 @@ static int run_bpf_prog(char *prog, int cg_id) double percent_pkts, percent_bytes; char fname[100]; FILE *fout; + int k; + static const char *returnValNames[] = { + "DROP_PKT", + "ALLOW_PKT", + "DROP_PKT_CWR", + "ALLOW_PKT_CWR" + }; +#define RET_VAL_COUNT 4 // Future support of ingress // if (!outFlag) @@ -346,6 +358,31 @@ static int run_bpf_prog(char *prog, int cg_id) (qstats.bytes_total + 1); fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); + + // ECN CE markings + percent_pkts = (qstats.pkts_ecn_ce * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, + (int)qstats.pkts_ecn_ce); + + // Average cwnd + fprintf(fout, "avg cwnd:%d\n", + (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); + // Average rtt + fprintf(fout, "avg rtt:%d\n", + (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); + // Average credit + fprintf(fout, "avg credit:%d\n", + (int)(qstats.sum_credit / + (1500 * ((int)qstats.pkts_total) + 1))); + + // Return values stats + for (k = 0; k < RET_VAL_COUNT; k++) { + percent_pkts = (qstats.returnValCount[k] * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], + percent_pkts, (int)qstats.returnValCount[k]); + } fclose(fout); } @@ -366,14 +403,15 @@ static void Usage(void) { printf("This program loads a cgroup skb BPF program to enforce\n" "cgroup output (egress) bandwidth limits.\n\n" - "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n" - " [-t <secs>] [-w] [-h] [prog]\n" + "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" + " [-s] [-t <secs>] [-w] [-h] [prog]\n" " Where:\n" " -o indicates egress direction (default)\n" " -d print BPF trace debug buffer\n" " -l also limit flows using loopback\n" " -n <#> to create cgroup \"/hbm#\" and attach prog\n" " Default is /hbm1\n" + " --no_cn disable CN notifcations\n" " -r <rate> Rate in Mbps\n" " -s Update HBM stats\n" " -t <time> Exit after specified seconds (default is 0)\n" @@ -393,9 +431,16 @@ int main(int argc, char **argv) int k; int cg_id = 1; char *optstring = "iodln:r:st:wh"; + struct option loptions[] = { + {"no_cn", 0, NULL, 1}, + {NULL, 0, NULL, 0} + }; - while ((k = getopt(argc, argv, optstring)) != -1) { + while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { switch (k) { + case 1: + no_cn_flag = true; + break; case'o': break; case 'd': diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h index 518e8147d084..f0963ed6a562 100644 --- a/samples/bpf/hbm.h +++ b/samples/bpf/hbm.h @@ -19,7 +19,8 @@ struct hbm_vqueue { struct hbm_queue_stats { unsigned long rate; /* in Mbps*/ unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ - loopback:1; /* also limit flows using loopback */ + loopback:1, /* also limit flows using loopback */ + no_cn:1; /* do not use cn flags */ unsigned long long pkts_marked; unsigned long long bytes_marked; unsigned long long pkts_dropped; @@ -28,4 +29,10 @@ struct hbm_queue_stats { unsigned long long bytes_total; unsigned long long firstPacketTime; unsigned long long lastPacketTime; + unsigned long long pkts_ecn_ce; + unsigned long long returnValCount[4]; + unsigned long long sum_cwnd; + unsigned long long sum_rtt; + unsigned long long sum_cwnd_cnt; + long long sum_credit; }; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index 41384be233b9..be19cf1d5cd5 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -65,17 +65,43 @@ struct bpf_map_def SEC("maps") queue_stats = { BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); struct hbm_pkt_info { + int cwnd; + int rtt; bool is_ip; bool is_tcp; short ecn; }; +static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) +{ + struct bpf_sock *sk; + struct bpf_tcp_sock *tp; + + sk = skb->sk; + if (sk) { + sk = bpf_sk_fullsock(sk); + if (sk) { + if (sk->protocol == IPPROTO_TCP) { + tp = bpf_tcp_sock(sk); + if (tp) { + pkti->cwnd = tp->snd_cwnd; + pkti->rtt = tp->srtt_us >> 3; + return 0; + } + } + } + } + return 1; +} + static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) { struct iphdr iph; struct ipv6hdr *ip6h; + pkti->cwnd = 0; + pkti->rtt = 0; bpf_skb_load_bytes(skb, 0, &iph, 12); if (iph.version == 6) { ip6h = (struct ipv6hdr *)&iph; @@ -91,6 +117,8 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, pkti->is_tcp = false; pkti->ecn = 0; } + if (pkti->is_tcp) + get_tcp_info(skb, pkti); } static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) @@ -105,8 +133,14 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, int len, unsigned long long curtime, bool congestion_flag, - bool drop_flag) + bool drop_flag, + bool cwr_flag, + bool ecn_ce_flag, + struct hbm_pkt_info *pkti, + int credit) { + int rv = ALLOW_PKT; + if (qsp != NULL) { // Following is needed for work conserving __sync_add_and_fetch(&(qsp->bytes_total), len); @@ -116,7 +150,7 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, qsp->firstPacketTime = curtime; qsp->lastPacketTime = curtime; __sync_add_and_fetch(&(qsp->pkts_total), 1); - if (congestion_flag || drop_flag) { + if (congestion_flag) { __sync_add_and_fetch(&(qsp->pkts_marked), 1); __sync_add_and_fetch(&(qsp->bytes_marked), len); } @@ -125,6 +159,34 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, __sync_add_and_fetch(&(qsp->bytes_dropped), len); } + if (ecn_ce_flag) + __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1); + if (pkti->cwnd) { + __sync_add_and_fetch(&(qsp->sum_cwnd), + pkti->cwnd); + __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1); + } + if (pkti->rtt) + __sync_add_and_fetch(&(qsp->sum_rtt), + pkti->rtt); + __sync_add_and_fetch(&(qsp->sum_credit), credit); + + if (drop_flag) + rv = DROP_PKT; + if (cwr_flag) + rv |= 2; + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[0]), + 1); + else if (rv == ALLOW_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[1]), + 1); + else if (rv == 2) + __sync_add_and_fetch(&(qsp->returnValCount[2]), + 1); + else if (rv == 3) + __sync_add_and_fetch(&(qsp->returnValCount[3]), + 1); } } } diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c index f806863d0b79..829934bd43cb 100644 --- a/samples/bpf/hbm_out_kern.c +++ b/samples/bpf/hbm_out_kern.c @@ -62,11 +62,12 @@ int _hbm_out_cg(struct __sk_buff *skb) unsigned int queue_index = 0; unsigned long long curtime; int credit; - signed long long delta = 0, zero = 0; + signed long long delta = 0, new_credit; int max_credit = MAX_CREDIT; bool congestion_flag = false; bool drop_flag = false; bool cwr_flag = false; + bool ecn_ce_flag = false; struct hbm_vqueue *qdp; struct hbm_queue_stats *qsp = NULL; int rv = ALLOW_PKT; @@ -99,9 +100,11 @@ int _hbm_out_cg(struct __sk_buff *skb) */ if (delta > 0) { qdp->lasttime = curtime; - credit += CREDIT_PER_NS(delta, qdp->rate); - if (credit > MAX_CREDIT) + new_credit = credit + CREDIT_PER_NS(delta, qdp->rate); + if (new_credit > MAX_CREDIT) credit = MAX_CREDIT; + else + credit = new_credit; } credit -= len; qdp->credit = credit; @@ -119,13 +122,16 @@ int _hbm_out_cg(struct __sk_buff *skb) // Set flags (drop, congestion, cwr) // Dropping => we are congested, so ignore congestion flag if (credit < -DROP_THRESH || - (len > LARGE_PKT_THRESH && - credit < -LARGE_PKT_DROP_THRESH)) { - // Very congested, set drop flag + (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop packet drop_flag = true; + if (pkti.ecn) + congestion_flag = true; + else if (pkti.is_tcp) + cwr_flag = true; } else if (credit < 0) { // Congested, set congestion flag - if (pkti.ecn) { + if (pkti.ecn || pkti.is_tcp) { if (credit < -MARK_THRESH) congestion_flag = true; else @@ -136,22 +142,38 @@ int _hbm_out_cg(struct __sk_buff *skb) } if (congestion_flag) { - if (!bpf_skb_ecn_set_ce(skb)) { - if (len > LARGE_PKT_THRESH) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (-credit >= MARK_THRESH + + (rand % MARK_REGION_SIZE)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { // Problem if too many small packets? drop_flag = true; } } } - if (drop_flag) - rv = DROP_PKT; + if (qsp != NULL) + if (qsp->no_cn) + cwr_flag = false; - hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, credit); - if (rv == DROP_PKT) + if (drop_flag) { __sync_add_and_fetch(&(qdp->credit), len); + rv = DROP_PKT; + } + if (cwr_flag) + rv |= 2; return rv; } char _license[] SEC("license") = "GPL"; |
