Why an SRIOV VF is not freed from its netnamespace

| 分类 Linux  | 标签 netnamespace  sriov 

错误

最近遇到几起由于容器异常停掉,导致容器内的VF设备没有及时归还给host的情况。

内核错误日志:

[2588629.076977] igb 0000:01:00.1: Setting VLAN 13, QOS 0x0 on VF 2
[2588629.093820] igb 0000:01:00.1: VF 2 attempted to override administratively set VLAN tag
[2588629.093820] Reload the VF driver to resume operations
[2588629.094303] igbvf 0000:01:11.1: Failed to remove vlan id 0
[2588629.094306] failed to kill vid 0081/0 for device eth1

这相当于在容器内部执行了下面的命令:

#ip link set eth1 down

原因

内核在关闭网络设备时,会调用dev_close,并触发NETDEV_DOWN事件;如果网络设备设置了NETIF_F_HW_VLAN_CTAG_FILTER,VLAN模块在处理该事件时会尝试清除VLAN id 0:

static int vlan_device_event(struct notifier_block *unused, unsigned long event,
			     void *ptr)
{
...
	case NETDEV_DOWN:
		if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
			vlan_vid_del(dev, htons(ETH_P_8021Q), 0);

对于SRIOV设备(至少对于igb驱动是这样),如果通过PF设置了VF的VLAN ID(比如#ip link set eth1 vf 1 vlan $VLAN),就不允许VF直接设置VLAN ID(而必须通过PF进行设置):

//igb/igb_main.c
static void igb_rcv_msg_from_vf(struct igb_adapter *adapter, u32 vf)
{
...
retval = igb_read_mbx(hw, msgbuf, E1000_VFMAILBOX_SIZE, vf);

    switch ((msgbuf[0] & 0xFFFF)) {
...
    case E1000_VF_SET_VLAN:
        retval = -1;
        if (vf_data->pf_vlan) ///如果从PF设置VF
            dev_warn(&pdev->dev,
                 "VF %d attempted to override administratively set VLAN tag\nReload the VF driver to resume operations\n",
                 vf);
        else
            retval = igb_set_vf_vlan(adapter, msgbuf, vf);
        break;
//…
    msgbuf[0] |= E1000_VT_MSGTYPE_CTS;
out:
    /* notify the VF of the results of what it sent us */
    if (retval) ///设置失败
        msgbuf[0] |= E1000_VT_MSGTYPE_NACK;
    else
        msgbuf[0] |= E1000_VT_MSGTYPE_ACK;

    igb_write_mbx(hw, msgbuf, 1, vf);
}

VF设置VLAN ID

//igbvf/vf.c
/**
 *  e1000_set_vfta_vf - Set/Unset vlan filter table address
 *  @hw: pointer to the HW structure
 *  @vid: determines the vfta register and bit to set/unset
 *  @set: if true then set bit, else clear bit
 *
 *  Builds a two-word E1000_VF_SET_VLAN message, posts it through the
 *  mailbox and reads the reply back into the same buffer.
 *
 *  Returns the value returned by mbx->ops.read_posted(), or
 *  -E1000_ERR_MAC_INIT when the read succeeded but the reply word equals
 *  E1000_VF_SET_VLAN | E1000_VT_MSGTYPE_NACK.
 **/
static s32 e1000_set_vfta_vf(struct e1000_hw *hw, u16 vid, bool set)
{
    struct e1000_mbx_info *mbx = &hw->mbx;
    u32 msgbuf[2];
    s32 err;

    msgbuf[0] = E1000_VF_SET_VLAN;
    msgbuf[1] = vid;
    /* Setting the 8 bit field MSG INFO to true indicates "add" */
    if (set)
        msgbuf[0] |= 1 << E1000_VT_MSGINFO_SHIFT;
    /* Send the message to the PF; the result of the write is not checked */
    mbx->ops.write_posted(hw, msgbuf, 2);
    /* Read the result */
    err = mbx->ops.read_posted(hw, msgbuf, 2);

    /* Clear the CTS flag so the reply word can be compared exactly below */
    msgbuf[0] &= ~E1000_VT_MSGTYPE_CTS;

    /* if nacked the vlan was rejected */
    if (!err && (msgbuf[0] == (E1000_VF_SET_VLAN | E1000_VT_MSGTYPE_NACK)))
        err = -E1000_ERR_MAC_INIT;

    return err;
}

另外,值得一提的是,当VF的net namespace发生变化时,内核也会调用dev_close,也会导致上面的过程发生。

net device与netnamespace

netnamespace cleanup work

net namespace模块在初始化时,会创建一个netns的workqueue内核线程:

static struct workqueue_struct *netns_wq;
static int __init net_ns_init(void)
{
...
    /* Create workqueue for cleanup */
    netns_wq = create_singlethread_workqueue("netns");
...

这个内核线程专门负责net namespace的清除操作。 当内核销毁net namespace时,就会将net namespace加到全局的cleanup_list,然后由netns内核线程负责net namespace的清除工作:

static DEFINE_SPINLOCK(cleanup_list_lock);
static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */

static DECLARE_WORK(net_cleanup_work, cleanup_net);

/*
 * Queue @net for cleanup: link it onto the global cleanup_list while holding
 * cleanup_list_lock, then queue net_cleanup_work on the netns_wq workqueue.
 */
void __put_net(struct net *net)
{
    /* Cleanup the network namespace in process context */
    unsigned long flags;

    /* Add net to cleanup_list */
    spin_lock_irqsave(&cleanup_list_lock, flags);
    list_add(&net->cleanup_list, &cleanup_list);
    spin_unlock_irqrestore(&cleanup_list_lock, flags);

    /* The cleanup itself is deferred to the work item queued here */
    queue_work(netns_wq, &net_cleanup_work);
}

函数cleanup_net主要执行各种注册的pernet_operations->exit操作:

static void cleanup_net(struct work_struct *work)
{
    const struct pernet_operations *ops;
..
    /* Run all of the network namespace exit methods */
    list_for_each_entry_reverse(ops, &pernet_list, list)
        ops_exit_list(ops, &net_exit_list);

pernet_operations

pernet_operations包含init和exit函数,init函数在创建netnamespace时调用,exit在销毁netnamespace时调用。各个涉及到netnamespace的网络模块,比如VXLAN,都有对应的pernet_operations对象;当netnamespace创建或者销毁时,内核会调用其init/exit方法,进行初始化或者善后工作。

其中,对网络设备的处理由default_device_exit完成,它主要负责当netnamespace销毁时,将其中的网络设备移到init netnamespace:

//net/core/dev.c
/*
 * Per-namespace operations for network devices. Only the exit hooks are
 * set here: default_device_exit and default_device_exit_batch.
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};


static void __net_exit default_device_exit(struct net *net)
{
...
    for_each_netdev_safe(net, dev, aux) {
        int err;
        char fb_name[IFNAMSIZ];

        /* Ignore unmoveable devices (i.e. loopback) */
        if (dev->features & NETIF_F_NETNS_LOCAL)
            continue;

        /* Leave virtual devices for the generic cleanup */
        if (dev->rtnl_link_ops)
            continue;
	      //将网络设备移到init netnamespace
        /* Push remaining network devices to init_net */
        snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
        err = dev_change_net_namespace(dev, &init_net, fb_name);
        if (err) {
            pr_emerg("%s: failed to move %s to init_net: %d\n",
                 __func__, dev->name, err);
            BUG();
        }
    }

}

问题

网络容器关闭之后,VF对应的网络设备并没有及时移到init netnamespace:

那么问题来了,

**是因为内核线程没有及时处理cleanup任务,还是因为内核没有触发销毁netnamespace的逻辑而导致没有创建cleanup work呢?**

跟踪内核函数__put_net的调用,发现其调用时间与VF移动时间一致:

# ./kprobe -s 'p:__put_net'
__put_net
Tracing kprobe __put_net. Ctrl-C to end.
          <idle>-0     [013] dNs. 2611893.167041: __put_net: (__put_net+0x0/0x80)
          <idle>-0     [013] dNs. 2611893.167047: <stack trace>
 => sk_free
 => tcp_write_timer
 => call_timer_fn
 => run_timer_softirq
 => __do_softirq
 => call_softirq
 => do_softirq
 => irq_exit
 => smp_apic_timer_interrupt
 => apic_timer_interrupt
 => arch_cpu_idle
 => cpu_idle_loop
 => cpu_startup_entry
 => start_secondary

从调用栈可以看出,__put_net是在TCP定时器(tcp_write_timer)中释放socket(sk_free)时才被调用的。也就是说,netnamespace中还有socket没有释放,导致netnamespace没有及时释放。

socket与netnamespace

netnamespace对象有一个引用计数:

//include/net/net_namespace.h
struct net {
    atomic_t        count;      /* To decided when the network
                         *  namespace should be shut down.
                         */

当我们在netnamespace中创建一个socket时,内核就会设置该socket所属的netnamespace,并将netnamespace的引用计数加1:

#ifdef CONFIG_NET_NS

/*
 * Take a reference on @net: atomically increments net->count and returns
 * the same pointer so the call can be used inline as an argument.
 */
static inline struct net *get_net(struct net *net)
{
    atomic_inc(&net->count);
    return net;
}

struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
              struct proto *prot)
{
    sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
    if (sk) {
...
       //设置netnamespace,引用计数加1
        sock_net_set(sk, get_net(net)); 

当释放socket时,就会执行相反的过程:

static void __sk_free(struct sock *sk)
{
...
    put_net(sock_net(sk));
}

/*
 * Drop a reference on @net: atomically decrements net->count and, when
 * atomic_dec_and_test() reports true, hands @net to __put_net().
 */
static inline void put_net(struct net *net)
{
    if (atomic_dec_and_test(&net->count))
        __put_net(net);
}

进程与netnamespace

当进程结束时,也会尝试删除netnamespace:

总结

容器异常停掉之后,socket没有及时释放(socket中的数据还没有被确认完),导致netnamespace没有及时释放,进而导致netnamespace中的VF设备没有回到init netnamespace。


上一篇     下一篇