内核将未缓存的IPv6路由项组成一个链表rt6_uncached_list,其为一个每处理器变量。
/*
 * Container for IPv6 routes that are linked on an "uncached" list.
 * One instance exists per CPU (see the per-CPU definition below).
 */
struct uncached_list {
	spinlock_t lock;	/* taken around modifications of 'head' */
	struct list_head head;	/* rt6_info entries, linked via rt6i_uncached */
};

/* Per-CPU instance of the uncached route list. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
在IPv6路由初始化函数中,初始化rt6_uncached_list链表头和自旋锁。
int __init ip6_route_init(void) { for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); }
函数rt6_uncached_list_add和函数rt6_uncached_list_del分别用于将路由缓存rt添加到rt6_uncached_list链表上,或者从链表中删除。
/*
 * Link @rt at the tail of this CPU's uncached list.
 *
 * The list that was chosen is remembered in rt->rt6i_uncached_list so
 * that rt6_uncached_list_del() can later take the matching lock.
 * This helper does not touch the fib_rt_uncache counter.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	/* Record the owning list before the entry becomes visible on it. */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

/*
 * Unlink @rt from the uncached list it was added to, if any.
 *
 * Does nothing when rt->rt6i_uncached is an empty list node. Otherwise
 * the entry is removed and the per-netns fib_rt_uncache counter is
 * decremented, both under the owning list's lock.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	/* An empty node means the route is not on any uncached list. */
	if (!list_empty(&rt->rt6i_uncached)) {
		/* Use the list recorded at add time, not the current CPU's. */
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
在IPv6路由缓存分配函数中,初始化其rt6i_uncached链表指针。
/*
 * Initialise the rt6-specific part of a freshly allocated @rt:
 * zero everything that follows the embedded dst_entry and set up the
 * uncached-list node as an empty list.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/*
	 * 'dst + 1' is the first byte after the dst_entry; the dst_entry
	 * itself is left untouched.
	 * NOTE(review): the size arithmetic assumes 'dst' is the first
	 * member of struct rt6_info - confirm against the struct layout.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	/* Empty node: rt6_uncached_list_del() treats this as "not listed". */
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/*
 * Allocate a dst with ip6_dst_ops.
 *
 * Returns the new rt6_info, or NULL when dst_alloc() fails. On success
 * the rt6-specific fields are initialised and the per-netns
 * fib_rt_alloc counter is incremented.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags)
{
	/* NOTE(review): the literal 1 is presumably the initial refcount - confirm in dst_alloc(). */
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
与IPv4不同,IPv6的出口路由和入口路由都使用函数ip6_pol_route实现,区别在于传入的接口索引参数不同。
/*
 * Input-path policy route lookup: a thin wrapper around ip6_pol_route()
 * that passes the flow's input interface index (fl6->flowi6_iif).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

/*
 * Output-path policy route lookup: identical to the input variant
 * except that the flow's output interface index (fl6->flowi6_oif) is
 * passed to ip6_pol_route().
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
首先,函数rt6_find_cached_rt在fib查询结果的exception表中查找缓存的路由,如果找到,则返回此值。
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct rt6_info *rt = NULL; WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && !rcu_read_lock_held()); strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); if (res.f6i == net->ipv6.fib6_null_entry) goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
否则,判断流结构flowi6是否设置了FLOWI_FLAG_KNOWN_NH标志,并且下一跳没有设置网关地址族(fib_nh_gw_family为0),这种在已知下一跳的前提下查找路由的情况不常见。而且,由于fl6结构的目的地址成员daddr使用的是下一跳地址,而不是skb报文中的目的地址,此时创建的路由缓存项不会缓存在fib6树中,而是将其添加到uncached_list链表。
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); if (rt) { /* 1 refcnt is taken during ip6_rt_cache_alloc(). * As rt6_uncached_list_add() does not consume refcnt, * this refcnt is always returned to the caller even * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); rcu_read_unlock(); return rt; }
如果以上两种情况都不成立,则获取每处理器路由缓存项(不存在时新建);该路由项在此过程中已被缓存,不必加到uncached_list链表。
} else { /* Get a percpu copy */ local_bh_disable(); rt = rt6_get_pcpu_route(&res); if (!rt) rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); }
例如,对于SRv6,在路由查询前,如果下一跳地址有效,则将流结构的目的地址设置为下一跳地址,并且设置FLOWI_FLAG_KNOWN_NH标志。
static int seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id, bool local_delivery) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct dst_entry *dst = NULL; struct rt6_info *rt; struct flowi6 fl6; int dev_flags = 0; fl6.flowi6_iif = skb->dev->ifindex; fl6.daddr = nhaddr ? *nhaddr : hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; if (nhaddr) fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; if (!tbl_id) { dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags); } else { struct fib6_table *table; table = fib6_get_table(net, tbl_id); if (!table) goto out; rt = ip6_pol_route(net, table, 0, &fl6, skb, flags); dst = &rt->dst;
对于使用ICMPv6的IPv6邻居发现、IGMP和MLD协议,利用icmp6_dst_alloc分配路由缓存项。对于这类报文,仅限于本地网络,报文的下一跳地址和目的地址相同,这里不查询fib6表,直接分配缓存项,导致新分配的路由在fib树中没有缓存位置,所以将其添加到uncached_list链表。
struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); if (unlikely(!idev)) return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); goto out; } rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
当接口被注销或者down时,由函数rt6_uncached_list_flush_dev清除设备相关的uncached路由缓存。
/*
 * Excerpt: only the opening of addrconf_ifdown() is shown. It maps
 * @unregister to NETDEV_UNREGISTER or NETDEV_DOWN and hands the device
 * and that event to rt6_disable_ip(). The rest of the function is not
 * part of this excerpt.
 */
static int addrconf_ifdown(struct net_device *dev, bool unregister)
{
	unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
	struct net *net = dev_net(dev);
	struct inet6_dev *idev;

	rt6_disable_ip(dev, event);
	/* ... excerpt truncated here ... */

/*
 * Tear down IPv6 routing state tied to @dev for the given netdev
 * @event. Calls, in this order: rt6_sync_down_dev(), then
 * rt6_uncached_list_flush_dev() for the device's netns, then
 * neigh_ifdown() on nd_tbl.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
遍历所有的rt6_uncached_list中的路由缓存,将其中与操作设备相等的缓存项的设备换成黑洞设备blackhole_netdev,并且将路由项的inet6_dev换成回环接口对应的inet6_dev。
可见,实际上并没有将路由缓存项从uncached_list链表中删除。
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { struct net_device *loopback_dev = net->loopback_dev; if (dev == loopback_dev) return; for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); struct rt6_info *rt; spin_lock_bh(&ul->lock); list_for_each_entry(rt, &ul->head, rt6i_uncached) { struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(loopback_dev); in6_dev_put(rt_idev); } if (rt_dev == dev) { rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); }
在销毁路由缓存时,由函数rt6_uncached_list_del检测其是否在uncached_list链表上,为真将其移除,并且递减net->ipv6.rt6_stats->fib_rt_uncache计数,这是在rt6_uncached_list_del内部完成。与此不同,此计数的递增是在rt6_uncached_list_add外部完成。
/*
 * Release what an IPv6 route cache entry still holds when it is
 * destroyed.
 *
 * In order: put the dst metrics, unlink the route from the uncached
 * list (a no-op if it is not on one; the fib_rt_uncache decrement
 * happens inside rt6_uncached_list_del()), drop the inet6_dev
 * reference, and release the fib6_info stored in rt->from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	/* Clear the pointer first, then drop the reference it carried. */
	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/*
	 * Swap rt->from with NULL and release the old value.
	 * NOTE(review): 'from' may be NULL here; presumably
	 * fib6_info_release() tolerates that - confirm.
	 */
	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
对于出口路由查找,按照代码中的注释,如果路由缓存已经加入uncached_list链表,说明缓存引用计数已经递增。但是,uncached_list的添加并没有增加路由缓存引用计数,只有在初始分配时设置的引用计数1。
对于uncached的路由缓存,引用计数为1即可,在使用完成之后执行释放操作,这里不需要再次增加引用计数,感觉这个注释有问题,但是代码逻辑并没有问题。对于需要缓存的路由,执行dst_hold_safe增加引用计数,计数应当为2,这样在使用完之后,进行释放时,将计数减为1,并没有实际释放,路由缓存还在。
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { struct dst_entry *dst; struct rt6_info *rt6; rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); rt6 = (struct rt6_info *)dst; /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; dst_hold(dst); }
以下为路由合法性检测函数ip6_dst_check,如果路由缓存项具有RTF_PCPU标志,其可能在函数ip6_rt_pcpu_alloc中设置,在同时设置了缓存的from成员(fib6_info结构)时调用rt6_dst_from_check执行检查。另外,根据之前的介绍,有两种情况将路由缓存加到了uncached_list链表上:一是ICMPv6报文发送时生成的缓存;二是已知下一跳地址的情况下创建的路由缓存。对于第一种情况,路由缓存直接生成,from成员为空,调用rt6_check进行检查;对于第二种情况调用rt6_dst_from_check检查。
/*
 * Validate a cached IPv6 dst against @cookie.
 *
 * Returns whatever the selected checker returns; on the sernum path
 * that is @dst when rt6_is_valid() succeeds and NULL otherwise.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	/* Routes with a non-zero sernum are validated by rt6_is_valid() alone. */
	if (rt->sernum)
		return rt6_is_valid(rt) ? dst : NULL;

	/* rt->from is read with rcu_dereference(), so stay in an RCU read section. */
	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/*
	 * With a 'from' present, per-CPU copies (RTF_PCPU) and routes
	 * sitting on an uncached list go through rt6_dst_from_check();
	 * everything else, including a NULL 'from', goes to rt6_check().
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU || unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
如果路由缓存没有设置RT6_LOOKUP_F_DST_NOREF标志,即其使用了引用计数,由函数ip6_rt_put递减计数。另外,如果缓存位于uncached_list链表上,表明不需要缓存此路由,也执行引用计数的递减。
/*
 * Conditionally release @rt.
 *
 * ip6_rt_put() is called when either
 *   - @flags does not contain RT6_LOOKUP_F_DST_NOREF, or
 *   - @rt is linked on an uncached list (its rt6i_uncached node is
 *     not empty).
 * Otherwise nothing is released.
 */
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
	if (!(flags & RT6_LOOKUP_F_DST_NOREF) || !list_empty(&rt->rt6i_uncached))
		ip6_rt_put(rt);
}
内核版本 5.10