YY哥 2017-10-17T03:49:58+00:00 hustcat@gmail.com Multiple queue and RSS in DPDK 2017-10-17T11:00:30+00:00 hustcat http://hustcat.github.io/rss-in-dpdk RX queue

rte_eth_dev->data(对应结构体rte_eth_dev_data)保存设备的(接收/发送)队列信息:

struct rte_eth_dev_data {
	char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */

	void **rx_queues; /**< Array of pointers to RX queues. */
	void **tx_queues; /**< Array of pointers to TX queues. */
	uint16_t nb_rx_queues; /**< Number of RX queues. */
	uint16_t nb_tx_queues; /**< Number of TX queues. */
///...

rx_queues为接收队列指针数组,每个指针指向一个具体的接收队列,以igb驱动(drivers/net/e1000)为例:

/**
 * Structure associated with each RX queue.
 *
 * Groups everything the igb driver tracks for one receive queue: the mbuf
 * pool used to refill the ring, the hardware descriptor ring (rx_ring) with
 * its DMA address and head/tail register addresses, the parallel software
 * ring (sw_ring), the segments of a packet still being assembled, and the
 * per-queue threshold/CRC/drop settings.
 */
struct igb_rx_queue {
	struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
	volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
	uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
	volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
	volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
	struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
	struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
	struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
	uint16_t            nb_rx_desc; /**< number of RX descriptors. */
	uint16_t            rx_tail;    /**< current value of RDT register. */
	uint16_t            nb_rx_hold; /**< number of held free RX desc. */
	uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
	uint16_t            queue_id;   /**< RX queue index. */
	uint16_t            reg_idx;    /**< RX queue register index. */
	uint8_t             port_id;    /**< Device port identifier. */
	uint8_t             pthresh;    /**< Prefetch threshold register. */
	uint8_t             hthresh;    /**< Host threshold register. */
	uint8_t             wthresh;    /**< Write-back threshold register. */
	uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
	uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
};

每个队列包含一个硬件描述符ring(rx_ring)和一个软件描述符ring(sw_ring),rx_ring主要由驱动与硬件使用,sw_ring实际上是一个mbuf指针数组,主要由DPDK应用程序使用。

  • e1000_adv_rx_desc

硬件描述符,所有的e1000_adv_rx_desc构成一个环形DMA缓冲区。对于接收数据时,pkt_addr指向rte_mbuf->buf_physaddr,从而使得网卡收到数据时,将数据写到mbuf对应的数据缓冲区。

/*
 * Receive Descriptor - Advanced
 *
 * One ring slot seen through two overlapping layouts:
 *  - read: the buffer addresses placed in the slot for reception
 *          (pkt_addr / hdr_addr);
 *  - wb:   the write-back layout carrying RSS hash or IP id/checksum,
 *          status/error bits, packet length and VLAN tag.
 * All fields are little-endian (__le16/__le32/__le64).
 */
union e1000_adv_rx_desc {
	struct {
		__le64 pkt_addr; /* Packet buffer address */
		__le64 hdr_addr; /* Header buffer address */
	} read; /* layout used when posting a buffer for receive */
	struct {
		struct {
			union {
				__le32 data;
				struct {
					__le16 pkt_info; /*RSS type, Pkt type*/
					/* Split Header, header buffer len */
					__le16 hdr_info;
				} hs_rss;
			} lo_dword;
			union {
				__le32 rss; /* RSS Hash */
				struct {
					__le16 ip_id; /* IP id */
					__le16 csum; /* Packet Checksum */
				} csum_ip;
			} hi_dword;
		} lower;
		struct {
			__le32 status_error; /* ext status/error */
			__le16 length; /* Packet length */
			__le16 vlan; /* VLAN tag */
		} upper;
	} wb;  /* writeback */
};
  • igb_rx_entry

每个硬件描述符都有一个对应的软件描述符,它是DPDK应用程序与DPDK驱动之间进行数据传递的桥梁,它实际上是一个rte_mbuf的指针,rte_mbuf->buf_physaddr为DMA的物理地址,由网卡硬件使用,rte_mbuf->buf_addr为buffer的虚拟地址,由DPDK应用程序使用。

/**
 * Structure associated with each descriptor of the RX ring of a RX queue.
 *
 * Software-side counterpart of one hardware descriptor: it only records
 * which mbuf is attached to that descriptor.
 */
struct igb_rx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};

/**
 * The generic rte_mbuf, containing a packet mbuf.
 */
struct rte_mbuf {
	MARKER cacheline0;

	void *buf_addr;           /**< Virtual address of segment buffer. */
	/**
	 * Physical address of segment buffer.
	 * Force alignment to 8-bytes, so as to ensure we have the exact
	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
	 * working on vector drivers easier.
	 */
	phys_addr_t buf_physaddr __rte_aligned(sizeof(phys_addr_t));
///...

Config queue

DPDK应用程序可以调用rte_eth_dev_configure设置Port的队列数量:

		ret = rte_eth_dev_configure(portid, nb_rx_queue,
					(uint16_t)n_tx_queue, &port_conf);

rte_eth_dev_configure会调用rte_eth_dev_rx_queue_config和rte_eth_dev_tx_queue_config设置接收队列和发送队列:

rte_eth_dev_configure
|---rte_eth_dev_rx_queue_config
|---rte_eth_dev_tx_queue_config
  • config rx queue
static int
rte_eth_dev_rx_queue_config(struct rte_eth_dev *dev, uint16_t nb_queues)
{
	uint16_t old_nb_queues = dev->data->nb_rx_queues;
	void **rxq;
	unsigned i;

	if (dev->data->rx_queues == NULL && nb_queues != 0) { /* first time configuration */
		dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
				sizeof(dev->data->rx_queues[0]) * nb_queues,
				RTE_CACHE_LINE_SIZE);
		if (dev->data->rx_queues == NULL) {
			dev->data->nb_rx_queues = 0;
			return -(ENOMEM);
		}
	}
///...

Setup queue

  • rte_eth_rx_queue_setup

DPDK application都会调用rte_eth_rx_queue_setup初始化接收队列。

int
rte_eth_rx_queue_setup(uint8_t port_id, uint16_t rx_queue_id,
		       uint16_t nb_rx_desc, unsigned int socket_id,
		       const struct rte_eth_rxconf *rx_conf,
		       struct rte_mempool *mp)
{
///...
	ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
					      socket_id, rx_conf, mp); ///eth_igb_ops, eth_igb_rx_queue_setup
}

eth_igb_rx_queue_setup会创建接收队列igb_rx_queue,分配RX ring hardware descriptors(e1000_adv_rx_desc)和software ring(igb_rx_entry):

int
eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
			 uint16_t queue_idx,
			 uint16_t nb_desc,
			 unsigned int socket_id,
			 const struct rte_eth_rxconf *rx_conf,
			 struct rte_mempool *mp)
{
	const struct rte_memzone *rz;
	struct igb_rx_queue *rxq;
	struct e1000_hw     *hw;
	unsigned int size;

	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
///...
	/* First allocate the RX queue data structure. */
	rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
			  RTE_CACHE_LINE_SIZE);
///...
	/*
	 *  Allocate RX ring hardware descriptors. A memzone large enough to
	 *  handle the maximum ring size is allocated in order to allow for
	 *  resizing in later calls to the queue setup function.
	 */
	size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
	rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
				      E1000_ALIGN, socket_id);
///...
	rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
	rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
	rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
	rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;

	/* Allocate software ring. */
	rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
				   sizeof(struct igb_rx_entry) * nb_desc,
				   RTE_CACHE_LINE_SIZE);
}

eth_igb_rx_queue_setup主要完成DMA描述符环形队列的初始化。

RSS

  • Configure RSS with DPDK

通过设置rxmode.mq_mode = ETH_MQ_RX_RSS(传给rte_eth_dev_configure的配置)可以开启Port的RSS,以l3fwd为例:

/*
 * Port configuration used by the l3fwd example: RSS multi-queue mode on the
 * receive side, hashing on IP, with no multi-queue mode on transmit.
 */
static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_RSS, /* selects the RSS branch in the driver's mq rx configure */
		.max_rx_pkt_len = ETHER_MAX_LEN,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 1, /**< IP checksum offload enabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = NULL, /* NULL: igb_rss_configure falls back to its default key */
			.rss_hf = ETH_RSS_IP,
		},
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};
  • Driver(igb) config RSS

eth_igb_start -> eth_igb_rx_init -> igb_dev_mq_rx_configure

//drivers/net/e1000/igb_rxtx.c
static int
igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
{
	struct e1000_hw *hw =
		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
	uint32_t mrqc;

	if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
		/*
		 * SRIOV active scheme
		 * FIXME if support RSS together with VMDq & SRIOV
		 */
		mrqc = E1000_MRQC_ENABLE_VMDQ;
		/* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
		mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
		E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
	} else if(RTE_ETH_DEV_SRIOV(dev).active == 0) { ///disable SRIOV
		/*
		 * SRIOV inactive scheme
		 */
		switch (dev->data->dev_conf.rxmode.mq_mode) {
			case ETH_MQ_RX_RSS:
				igb_rss_configure(dev); ///RSS
				break;
///...
}

static void
igb_rss_configure(struct rte_eth_dev *dev)
{
///...
	if (rss_conf.rss_key == NULL)
		rss_conf.rss_key = rss_intel_key; /* Default hash key */
	igb_hw_rss_hash_set(hw, &rss_conf);
}

Refs

]]>
KNI in DPDK 2017-10-11T23:00:30+00:00 hustcat http://hustcat.github.io/kni-in-dpdk 介绍

The Kernel NIC Interface (KNI) is a DPDK control plane solution that allows userspace applications to exchange packets with the kernel networking stack. To accomplish this, DPDK userspace applications use an IOCTL call to request the creation of a KNI virtual device in the Linux* kernel. The IOCTL call provides interface information and the DPDK’s physical address space, which is re-mapped into the kernel address space by the KNI kernel loadable module that saves the information to a virtual device context. The DPDK creates FIFO queues for packet ingress and egress to the kernel module for each device allocated.

The KNI kernel loadable module is a standard net driver, which upon receiving the IOCTL call access the DPDK’s FIFO queue to receive/transmit packets from/to the DPDK userspace application. The FIFO queues contain pointers to data packets in the DPDK. This:

  • Provides a faster mechanism to interface with the kernel net stack and eliminates system calls

  • Facilitates the DPDK using standard Linux* userspace net tools (tcpdump, ftp, and so on)

  • Eliminate the copy_to_user and copy_from_user operations on packets.

测试

Load KNI kernel module:

# insmod /root/dpdk/x86/lib/modules/3.10.0-514.el7.x86_64/extra/dpdk/rte_kni.ko

Build KNI application:

# export RTE_SDK=/root/dpdk/x86/share/dpdk
# cd examples/kni
# make
  CC main.o
  LD kni
  INSTALL-APP kni
  INSTALL-MAP kni.map

Run KNI application:

# build/kni -c 0x0f -n 2 -- -P -p 0x3 --config="(0,0,1),(1,2,3)" 
EAL: Detected 4 lcore(s)
EAL: No free hugepages reported in hugepages-1048576kB
EAL: Probing VFIO support...
EAL: WARNING: cpu flags constant_tsc=yes nonstop_tsc=no -> using unreliable clock cycles !
EAL: PCI device 0000:00:05.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:06.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:07.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
APP: Initialising port 0 ...
KNI: pci: 00:06:00       8086:100e
APP: Initialising port 1 ...
KNI: pci: 00:07:00       8086:100e

Checking link status
.....done
Port 0 Link Up - speed 1000 Mbps - full-duplex
Port 1 Link Up - speed 1000 Mbps - full-duplex
APP: Lcore 1 is writing to port 0
APP: Lcore 2 is reading from port 1
APP: Lcore 0 is reading from port 0
APP: Lcore 3 is writing to port 1
...

其中,

  • -c = core bitmask

  • -P = promiscuous mode

  • -p = port hex bitmask

  • --config="(port, lcore_rx, lcore_tx [,lcore_kthread, ...]) ..."

Note that each core can do either TX or RX for one port only.

[root@vm01 ~]# ip a
...
7: vEth0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
    link/ether ba:92:66:e5:2f:35 brd ff:ff:ff:ff:ff:ff
8: vEth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
    link/ether b2:64:67:2f:32:4a brd ff:ff:ff:ff:ff:ff
[root@vm01 ~]# ip addr add 10.0.10.30/24 dev vEth0
[root@vm01 ~]# ip link set vEth0 up
[root@vm03 ~]# ping -c 3 10.0.10.30 
PING 10.0.10.30 (10.0.10.30) 56(84) bytes of data.
64 bytes from 10.0.10.30: icmp_seq=1 ttl=64 time=14.2 ms
64 bytes from 10.0.10.30: icmp_seq=2 ttl=64 time=2.96 ms
64 bytes from 10.0.10.30: icmp_seq=3 ttl=64 time=1.89 ms

给kni应用进程发送SIGUSR1信号,kni应用进程会输出统计信息:

[root@vm01 ~]# pkill -10 kni

...
**KNI example application statistics**
======  ==============  ============  ============  ============  ============
 Port    Lcore(RX/TX)    rx_packets    rx_dropped    tx_packets    tx_dropped
------  --------------  ------------  ------------  ------------  ------------
      0          0/ 1            23             0             5             0
      1          2/ 3             1             0             0             0
======  ==============  ============  ============  ============  ============

实现

相关代码:

KNI示例程序位于examples/kni,KNI内核模块位于lib/librte_eal/linuxapp/kni,KNI library位于lib/librte_kni

整体实现如下:

数据接收

先调用rte_eth_rx_burst从网络接口读取数据,然后调用rte_kni_tx_burst通过FIFO传给内核模块。

		/* Burst rx from eth */
		nb_rx = rte_eth_rx_burst(port_id, 0, pkts_burst, PKT_BURST_SZ);

		/* Burst tx to kni */
		num = rte_kni_tx_burst(p->kni[i], pkts_burst, nb_rx);

rte_kni_tx_burst:

///librte_kni
/*
 * Pass a burst of mbufs to the kernel side of a KNI device.
 *
 * Each mbuf pointer is converted with va2pa() and the converted values are
 * pushed onto kni->rx_q. Afterwards kni_free_mbufs() is called regardless of
 * how many entries were queued.
 *
 * Returns the value reported by kni_fifo_put(), i.e. the number of entries
 * actually written to the fifo (may be smaller than num).
 */
unsigned
rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
{
	void *pa_burst[num];
	unsigned int idx = 0;
	unsigned int queued;

	/* The fifo carries the translated addresses, not the caller's pointers. */
	while (idx < num) {
		pa_burst[idx] = va2pa(mbufs[idx]);
		idx++;
	}

	queued = kni_fifo_put(kni->rx_q, pa_burst, num);

	/* Collect mbufs waiting in free_q and release them. */
	kni_free_mbufs(kni);

	return queued;
}

/**
 * Append up to num pointers from data[] to the fifo.
 *
 * The read index is sampled once on entry. Writing stops as soon as the next
 * write position would equal that read index, so one slot always stays
 * unused. The shared write index is published once, after the loop.
 *
 * Returns the number of elements actually written.
 */
static inline unsigned
kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
	unsigned slot = fifo->write;
	const unsigned read_pos = fifo->read;
	unsigned stored = 0;

	while (stored < num) {
		/* Index wrap relies on masking with (len - 1). */
		unsigned next = (slot + 1) & (fifo->len - 1);

		if (next == read_pos)
			break;

		fifo->buffer[slot] = data[stored];
		slot = next;
		stored++;
	}

	fifo->write = slot;
	return stored;
}
  • fifo

DPDK应用通过fifo与内核模块交换数据,fifo实际上是一块环形共享内存:

/*
 * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO
 * Write and read should wrap around. Fifo is empty when write == read
 * Writing should never overwrite the read position
 *
 * NOTE(review): kni_fifo_put() wraps indices with "& (len - 1)", so len is
 * presumably expected to be a power of two -- confirm against kni_fifo_init().
 */
struct rte_kni_fifo {
	volatile unsigned write;     /**< Next position to be written*/
	volatile unsigned read;      /**< Next position to be read */
	unsigned len;                /**< Circular buffer length */
	unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
	void *volatile buffer[];     /**< The buffer contains mbuf pointers */
};

DPDK应用程序在初始化时,需要将fifo共享内存的地址告诉KNI内核模块:

struct rte_kni *
rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
	      const struct rte_kni_conf *conf,
	      struct rte_kni_ops *ops)
{
///...
	/* TX RING */
	mz = slot->m_tx_q;
	ctx->tx_q = mz->addr;
	kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
	dev_info.tx_phys = mz->phys_addr;

	/* RX RING */
	mz = slot->m_rx_q;
	ctx->rx_q = mz->addr;
	kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
	dev_info.rx_phys = mz->phys_addr;
///...
	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); ///内核模块
    
}

KNI kernel module

static int
kni_ioctl(struct inode *inode, uint32_t ioctl_num, unsigned long ioctl_param)
{
///..
	case _IOC_NR(RTE_KNI_IOCTL_CREATE):
		ret = kni_ioctl_create(net, ioctl_num, ioctl_param);

kni_ioctl_create会创建对应的网络设备vEthX,然后设置对应的fifo共享内存,并启动对应的内核线程:

static int
kni_ioctl_create(struct net *net, uint32_t ioctl_num,
		unsigned long ioctl_param)
{
///...
	net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name,
#ifdef NET_NAME_USER
							NET_NAME_USER,
#endif
							kni_net_init);
///...
	/* Translate user space info into kernel space info */
	kni->tx_q = phys_to_virt(dev_info.tx_phys);
	kni->rx_q = phys_to_virt(dev_info.rx_phys);
	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
	kni->free_q = phys_to_virt(dev_info.free_phys);
///...
	ret = kni_run_thread(knet, kni, dev_info.force_bind);
///...
}

kernel thread:

KNI内核线程不断从fifo共享内存读取数据,然后交给内核协议栈继续处理:

static int
kni_thread_single(void *data)
{
	struct kni_net *knet = data;
	int j;
	struct kni_dev *dev;

	while (!kthread_should_stop()) {
		down_read(&knet->kni_list_lock);
		for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
			list_for_each_entry(dev, &knet->kni_list_head, list) {
				kni_net_rx(dev);
				kni_net_poll_resp(dev);
			}
		}
		up_read(&knet->kni_list_lock);
///...
}

/*
 * RX entry point: forwards to whichever handler kni_net_rx_func currently
 * points at.
 */
void
kni_net_rx(struct kni_dev *kni)
{
	/*
	 * No NULL check: the function pointer is given a default value
	 * where it is defined.
	 */
	kni_net_rx_func(kni);
}
  • kni_net_rx_func

kni_net_rx_func从fifo共享内存读取数据,然后分配skb,拷贝数据,调用netif_rx_ni进入内核协议栈:

static void kni_net_rx_normal(struct kni_dev *kni);

/* kni rx function pointer, with default to normal rx */
static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
/*
 * RX: normal working mode
 */
static void
kni_net_rx_normal(struct kni_dev *kni)
{
///...
	/* Calculate the number of entries to dequeue from rx_q */
	num_rx = min_t(uint32_t, num_fq, MBUF_BURST_SZ);

	/* Burst dequeue from rx_q */
	num_rx = kni_fifo_get(kni->rx_q, kni->pa, num_rx);
	if (num_rx == 0)
		return;
///...
	/* Transfer received packets to netif */
	for (i = 0; i < num_rx; i++) {
		kva = pa2kva(kni->pa[i]);
		len = kva->pkt_len;
		data_kva = kva2data_kva(kva);
		kni->va[i] = pa2va(kni->pa[i], kva);

		skb = dev_alloc_skb(len + 2);
///...
		skb->dev = dev;
		skb->protocol = eth_type_trans(skb, dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;

		/* Call netif interface */
		netif_rx_ni(skb); ///进入内核协议栈
///...
	}
///...
}

发送数据

  • KNI kernel interface
/*
 * Transmit a packet (called by the kernel)
 */
static int
kni_net_tx(struct sk_buff *skb, struct net_device *dev)
{
///...
	/* dequeue a mbuf from alloc_q */
	ret = kni_fifo_get(kni->alloc_q, &pkt_pa, 1);
	if (likely(ret == 1)) {
		void *data_kva;

		pkt_kva = pa2kva(pkt_pa);
		data_kva = kva2data_kva(pkt_kva);
		pkt_va = pa2va(pkt_pa, pkt_kva);

		len = skb->len; /// data length
		memcpy(data_kva, skb->data, len); /// copy data
		if (unlikely(len < ETH_ZLEN)) {
			memset(data_kva + len, 0, ETH_ZLEN - len);
			len = ETH_ZLEN;
		}
		pkt_kva->pkt_len = len;
		pkt_kva->data_len = len;

		/* enqueue mbuf into tx_q */
		ret = kni_fifo_put(kni->tx_q, &pkt_va, 1);/// put tx_q
///...
  • DPDK app
		/* Burst rx from kni */
		num = rte_kni_rx_burst(p->kni[i], pkts_burst, PKT_BURST_SZ);

		/* Burst tx to eth */
		nb_tx = rte_eth_tx_burst(port_id, 0, pkts_burst, (uint16_t)num);

rte_kni_rx_burst从tx_q队列取出mbuf:

/*
 * Fetch up to num mbufs from the device's tx_q fifo into mbufs[].
 *
 * When at least one entry was taken, kni_allocate_mbufs() is called so that
 * alloc_q gets new mbufs. Returns the count reported by kni_fifo_get().
 */
unsigned
rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
{
	unsigned received;

	received = kni_fifo_get(kni->tx_q, (void **)mbufs, num);
	if (received == 0)
		return 0;

	/* Buffers were removed: allocate mbufs and put them into alloc_q. */
	kni_allocate_mbufs(kni);

	return received;
}

Refs

]]>
Introduction to the UIO 2017-10-10T23:00:30+00:00 hustcat http://hustcat.github.io/introduction-to-uio UIO

每个UIO设备可以通过设备文件(/dev/uioX)和sysfs的属性文件来访问。

可以通过mmap映射/dev/uioX来访问UIO设备的寄存器或者RAM。

直接read /dev/uioX来获取UIO设备的中断,read()会被阻塞,发生中断时就会返回。

Each UIO device is accessed through a device file and several sysfs attribute files. The device file will be called /dev/uio0 for the first device, and /dev/uio1, /dev/uio2 and so on for subsequent devices.

/dev/uioX is used to access the address space of the card. Just use mmap() to access registers or RAM locations of your card.

Interrupts are handled by reading from /dev/uioX. A blocking read() from /dev/uioX will return as soon as an interrupt occurs. You can also use select() on /dev/uioX to wait for an interrupt. The integer value read from /dev/uioX represents the total interrupt count. You can use this number to figure out if you missed some interrupts.

uio driver

uio_pci_generic

UIO设备需要UIO内核驱动的支持,uio_pci_generic是一个通用的PCI UIO设备的内核驱动。

UIO does not completely eliminate the need for kernel-space code. A small module is required to set up the device, perhaps interface to the PCI bus, and register an interrupt handler. The last function (interrupt handling) is particularly important; much can be done in user space, but there needs to be an in-kernel interrupt handler which knows how to tell the device to stop crying for attention.

/* From drivers/uio/uio_pci_generic.c */
/*
 * PCI driver registration record for uio_pci_generic. No static ID table is
 * provided, so devices are matched only through dynamically added IDs.
 */
static struct pci_driver driver = {
	.name = "uio_pci_generic",
	.id_table = NULL, /* only dynamic id's */
	.probe = probe,
	.remove = remove,
};

interrupt

uio_register_device 注册UIO驱动时,会注册中断处理函数uio_interrupt:

/**
 * uio_register_device - register a new userspace IO device
 * @owner:	module that creates the new device
 * @parent:	parent device
 * @info:	UIO device capabilities
 *
 * returns zero on success or a negative error code.
 */
int __uio_register_device(struct module *owner,
			  struct device *parent,
			  struct uio_info *info)
{
///...
	if (info->irq && (info->irq != UIO_IRQ_CUSTOM)) {
		ret = request_irq(info->irq, uio_interrupt,
				  info->irq_flags, info->name, idev);
		if (ret)
			goto err_request_irq;
	}
}

/**
 * uio_interrupt - hardware interrupt handler
 * @irq: IRQ number, can be UIO_IRQ_CYCLIC for cyclic timer
 * @dev_id: Pointer to the devices uio_device structure
 *
 * Calls the handler stored in the device's uio_info. Userspace is notified
 * only when that handler reports IRQ_HANDLED. The handler's return value is
 * passed back unchanged.
 */
static irqreturn_t uio_interrupt(int irq, void *dev_id)
{
	struct uio_device *idev = dev_id;
	irqreturn_t status;

	status = idev->info->handler(irq, idev->info);
	if (status != IRQ_HANDLED)
		return status;

	/* The device driver claimed the interrupt: wake up listeners. */
	uio_event_notify(idev->info);
	return status;
}
  • uio_event_notify

UIO设备每次发生中断时,都会增加uio_device->event,然后唤醒等待进程

/**
 * uio_event_notify - trigger an interrupt event
 * @info: UIO device capabilities
 *
 * Increments the device's event counter, wakes tasks sleeping on the
 * device's wait queue, and sends SIGIO to the async queue.
 */
void uio_event_notify(struct uio_info *info)
{
	struct uio_device *dev = info->uio_dev;

	/* Count the event first so woken tasks see the new value. */
	atomic_inc(&dev->event);

	wake_up_interruptible(&dev->wait);
	kill_fasync(&dev->async_queue, SIGIO, POLL_IN);
}
  • read /dev/uioX

每次调用open /dev/uioX时,内核都会创建一个uio_listener,关联到struct file->private_data:

static int uio_open(struct inode *inode, struct file *filep)
{
	struct uio_device *idev;
	struct uio_listener *listener;
	int ret = 0;

	mutex_lock(&minor_lock);
	idev = idr_find(&uio_idr, iminor(inode));
	mutex_unlock(&minor_lock);
///...
	listener = kmalloc(sizeof(*listener), GFP_KERNEL);
	if (!listener) {
		ret = -ENOMEM;
		goto err_alloc_listener;
	}

	listener->dev = idev;
	listener->event_count = atomic_read(&idev->event);
	filep->private_data = listener;

然后每次read()时,发现uio_listener->event_count与uio_device->event不相同时,就意味着发生中断,会返回对应的值;否则,将当前进程加入到uio_device->wait队列,然后挂起当前进程:

static ssize_t uio_read(struct file *filep, char __user *buf,
			size_t count, loff_t *ppos)
{
	struct uio_listener *listener = filep->private_data;
	struct uio_device *idev = listener->dev;
	DECLARE_WAITQUEUE(wait, current);
///...
	add_wait_queue(&idev->wait, &wait);

	do {
		set_current_state(TASK_INTERRUPTIBLE);

		event_count = atomic_read(&idev->event);
		if (event_count != listener->event_count) { ///irq happened
			if (copy_to_user(buf, &event_count, count))
				retval = -EFAULT;
			else {
				listener->event_count = event_count;
				retval = count;
			}
			break;
		}

		if (filep->f_flags & O_NONBLOCK) {
			retval = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			retval = -ERESTARTSYS;
			break;
		}
		schedule();
	} while (1);

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&idev->wait, &wait);

	return retval;
}
  • uio_mmap

当用户态driver调用mmap映射/dev/uioX时,内核会调用uio_mmap映射UIO设备的RAM:

/*
 * mmap() handler for /dev/uioX.
 *
 * Validates the requested range against the memory region selected by
 * uio_find_mem_index(), then delegates to the device's own mmap hook when
 * one is set, or otherwise to the physical/logical mapping helper matching
 * the region's memtype.
 *
 * Returns -EINVAL for a reversed range, an unknown region, a request larger
 * than the region, or an unsupported memtype; otherwise returns whatever the
 * chosen mapping routine returns.
 */
static int uio_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct uio_listener *listener = filep->private_data;
	struct uio_device *idev = listener->dev;
	unsigned long want, have;
	int idx;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;

	vma->vm_private_data = idev;

	idx = uio_find_mem_index(vma);
	if (idx < 0)
		return -EINVAL;

	/*
	 * Compare the pages requested with the pages the region occupies,
	 * counting the region's offset inside its first page.
	 */
	want = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	have = ((idev->info->mem[idx].addr & ~PAGE_MASK)
		+ idev->info->mem[idx].size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (want > have)
		return -EINVAL;

	/* A device-specific mmap hook replaces the generic paths below. */
	if (idev->info->mmap)
		return idev->info->mmap(idev->info, vma);

	switch (idev->info->mem[idx].memtype) {
	case UIO_MEM_LOGICAL:
	case UIO_MEM_VIRTUAL:
		return uio_mmap_logical(vma);
	case UIO_MEM_PHYS:
		return uio_mmap_physical(vma);
	default:
		return -EINVAL;
	}
}

更多关于mmap参考Linux MMAP & Ioremap introduction.

userspace driver

fd = open(/dev/uio0, O_RDWR|O_SYNC);
/* Map device's registers into user memory */
/* fitting the memory area on pages */
offset = addr & ~PAGE_MASK;
addr = 0 /* region 0 */ * PAGE_SIZE;
size = (size + PAGE_SIZE ­ 1) / PAGE_SIZE * PAGE_SIZE;
iomem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, 
fd, addr);
iomem += offset;
/* Stop the counting */
*(u_char *)SH_TMU_TSTR(iomem) |= ~(TSTR_TSTR2);
...
/* Wait for an interrupt */;
read(fd, &n_pending, sizeof(u_long));
val = *(u_int *)SH_TMU2_TCNT(iomem);
...
/* Stop the TMU */
*(u_char *)SH_TMU_TSTR(iomem) &= ~(TSTR_TSTR2);
munmap(iomem, size);
close(fd);

详细参考 Using UIO in an embedded platform.

参考

]]>
Checksum in Linux Kernel 2017-08-15T17:00:30+00:00 hustcat http://hustcat.github.io/checksum-in-kernel calculate IP/TCP/UDP checksum

内核计算IP/TCP/UDP的校验和的方法,参考How to Calculate IP/TCP/UDP Checksum–Part 1 Theory.

简单来说,就是对要计算的数据,以16bit为单元进行累加,然后取反。

TCP收包时,检查校验和:

/*
 * Prepare or verify the TCP checksum of a received IPv4 segment.
 *
 * - If the device supplied a full sum (CHECKSUM_COMPLETE) and it verifies
 *   against the pseudo-header, mark the skb CHECKSUM_UNNECESSARY and
 *   return 0.
 * - Otherwise seed skb->csum with the pseudo-header sum. Packets of at most
 *   76 bytes are verified immediately and that result is returned; for
 *   longer packets 0 is returned with skb->csum left seeded.
 */
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* Device-computed sum: only the pseudo-header has to be folded in. */
	if (skb->ip_summed == CHECKSUM_COMPLETE &&
	    !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum)) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		return 0;
	}

	/* Start from the pseudo-header checksum. */
	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len > 76)
		return 0;

	/* Short packet: checksum the whole packet now, on top of the pseudo-header sum. */
	return __skb_checksum_complete(skb);
}

csum_tcpudp_nofold用于计算伪头的checksum,__skb_checksum_complete基于伪头累加和(skb->csum)计算整个skb的校验和。

net_device->features

net_device->features字段表示设备的各种特性。其中一些位用于表示硬件校验和的计算能力:

/* Checksum-offload capability bits carried in net_device->features. */
#define NETIF_F_HW_CSUM		__NETIF_F(HW_CSUM) /* protocol-agnostic offload */
#define NETIF_F_IP_CSUM		__NETIF_F(IP_CSUM) /* ipv4 + TCP/UDP */
#define NETIF_F_IPV6_CSUM	__NETIF_F(IPV6_CSUM)

NETIF_F_IP_CSUM表示硬件可以计算L4 checksum,但是只针对IPV4的TCP和UDP。但是一些设备扩展支持VXLAN和NVGRE。 NETIF_F_IP_CSUM是一种协议感知的计算checksum的方法。具体来说,上层提供两个CSUM的参数(csum_start和csum_offset)。

NETIF_F_HW_CSUM is a protocol agnostic method to offload the transmit checksum. In this method the host provides checksum related parameters in a transmit descriptor for a packet. These parameters include the starting offset of data to checksum and the offset in the packet where the computed checksum is to be written. The length of data to checksum is implicitly the length of the packet minus the starting offset.

值得一提的是,igb/ixgbe使用的NETIF_F_IP_CSUM.

sk_buff

取决于skb是接收封包,还是发送封包,skb->csum和skb->ip_summed的意义会不同。

/*
 *	@csum: Checksum (must include start/offset pair)
 *	@csum_start: Offset from skb->head where checksumming should start
 *	@csum_offset: Offset from csum_start where checksum should be stored
 *	@ip_summed: Driver fed us an IP checksum
 */
struct sk_buff {
	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};

	__u8			local_df:1,
				cloned:1,
				ip_summed:2,
				nohdr:1,
				nfctinfo:3;

skb->ip_summed一般的取值:

/*
 * Values of skb->ip_summed.
 * Don't change this without changing skb_csum_unnecessary!
 */
#define CHECKSUM_NONE 0
#define CHECKSUM_UNNECESSARY 1 /* RX: hardware verified the checksums */
#define CHECKSUM_COMPLETE 2
#define CHECKSUM_PARTIAL 3 /* TX: stack stored the pseudo-header sum; the device completes the L4 checksum */

接收时的CSUM

对于接收包,skb->csum可能包含L4校验和。skb->ip_summed表述L4校验和的状态:

  • (1) CHECKSUM_UNNECESSARY

CHECKSUM_UNNECESSARY表示底层硬件已经计算了CSUM,以igb驱动为例:

igb_poll -> igb_clean_rx_irq -> igb_process_skb_fields -> igb_rx_checksum:

static inline void igb_rx_checksum(struct igb_ring *ring,
				   union e1000_adv_rx_desc *rx_desc,
				   struct sk_buff *skb)
{
///...
	/* Rx checksum disabled via ethtool */
	if (!(ring->netdev->features & NETIF_F_RXCSUM)) ///关闭RXCSUM
		return;

	/* TCP/UDP checksum error bit is set */
	if (igb_test_staterr(rx_desc,
			     E1000_RXDEXT_STATERR_TCPE |
			     E1000_RXDEXT_STATERR_IPE)) {
		/* work around errata with sctp packets where the TCPE aka
		 * L4E bit is set incorrectly on 64 byte (60 byte w/o crc)
		 * packets, (aka let the stack check the crc32c)
		 */
		if (!((skb->len == 60) &&
		      test_bit(IGB_RING_FLAG_RX_SCTP_CSUM, &ring->flags))) {
			u64_stats_update_begin(&ring->rx_syncp);
			ring->rx_stats.csum_err++;
			u64_stats_update_end(&ring->rx_syncp);
		}
		/* let the stack verify checksum errors,交给协议栈进一步验证csum */
		return;
	}
	/* It must be a TCP or UDP packet with a valid checksum */
	if (igb_test_staterr(rx_desc, E1000_RXD_STAT_TCPCS |
				      E1000_RXD_STAT_UDPCS))
		skb->ip_summed = CHECKSUM_UNNECESSARY; ///stack don't needed verify
}

TCP层在收到包后,发现skb->ip_summed为CHECKSUM_UNNECESSARY就不会再检查checksum了:

int tcp_v4_rcv(struct sk_buff *skb)
{
///...
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto csum_error;
///...
}

/*
 * Non-zero when software verification can be skipped. This is a bit test,
 * not an equality test, so it is tied to the numeric values of the
 * CHECKSUM_* constants.
 */
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
	int summed = skb->ip_summed;

	return summed & CHECKSUM_UNNECESSARY;
}
  • (2) CHECKSUM_NONE

csum中的校验和无效,可能有以下几种原因:

  • 设备不支持硬件校验和计算;

  • 设备计算了硬件校验和,但发现该数据帧已经损坏。此时,设备驱动程序可以直接丢弃该数据帧。但有些设备驱动程序(比如e1000/igb/ixgbe)却没有丢弃数据帧,而是将ip_summed设置为CHECKSUM_NONE,然后交给上层协议栈重新计算并处理这种错误。

  • (3) CHECKSUM_COMPLETE

表明网卡已经计算了L4层报头和payload的校验和,并且skb->csum已经被赋值,此时L4层的接收者只需要加伪头并验证校验结果。以TCP为例:

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr, ///check TCP/UDP pseudo-header checksum
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}
///...
}

值得一提的,igb/ixgbe没有使用CHECKSUM_COMPLETE,而是使用的CHECKSUM_UNNECESSARY.

注意CHECKSUM_COMPLETE和CHECKSUM_UNNECESSARY的区别,对于前者,上层还需要计算伪头校验和,再进行验证,见tcp_v4_check。实际上,早前的内核版本为CHECKSUM_HW

  • Veth的BUG

这里讨论一个有意思的问题:Linux kernel bug delivers corrupt TCP/IP data to Mesos, Kubernetes, Docker containers.

Veth设备会将CHECKSUM_NONE改为CHECKSUM_UNNECESSARY。这样,就会导致硬件收到损坏的数据帧后,转给veth后,却变成了CHECKSUM_UNNECESSARY,上层协议栈(TCP)就不会再计算检查数据包的校验和了。

static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
///...

	/* don't change ip_summed == CHECKSUM_PARTIAL, as that
	 * will cause bad checksum on forwarded packets
	 */
	if (skb->ip_summed == CHECKSUM_NONE &&
	    rcv->features & NETIF_F_RXCSUM)
		skb->ip_summed = CHECKSUM_UNNECESSARY;
}

veth最初是用于本地通信的设备,一般来说,本地的数据帧不太可能发生损坏。在发送数据时,如果协议栈已经计算校验和,会将skb->ip_summed设置为CHECKSUM_NONE。所以,对于veth本机通信,接收端没有必要再计算校验和。但是,对于容器虚拟化场景,veth的数据包可能来自网络,如果还这样设置,就会导致损坏的数据帧传给应用层。

发送时CSUM

同样,对于发送包,skb->ip_summed用于L4校验和的状态,以通知底层网卡是否还需要处理校验和:

  • (1) CHECKSUM_NONE

此时,CHECKSUM_NONE表示协议栈已经计算了校验和,设备不需要做任何事情。

  • (2) CHECKSUM_PARTIAL

CHECKSUM_PARTIAL表示使用硬件checksum,协议栈已经计算L4层的伪头的校验和,并且已经写入th->check/uh->check字段中,此时只需要设备计算整个4层(头部和数据)的校验值。

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
///...

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;
}


static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
///...
	icsk->icsk_af_ops->send_check(sk, skb); ///tcp_v4_send_check
}


/*
 * Fill in th->check for an outgoing IPv4 TCP segment.
 *
 * With CHECKSUM_PARTIAL only the complemented pseudo-header sum is stored,
 * and csum_start/csum_offset record where checksumming starts and where the
 * result goes. In every other case the checksum is computed here from the
 * TCP header plus the sum already held in skb->csum.
 */
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *tcph = tcp_hdr(skb);

	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		/* Software path (ip_summed == CHECKSUM_NONE). */
		tcph->check = tcp_v4_check(skb->len, saddr, daddr,
					   csum_partial(tcph,
							tcph->doff << 2,
							skb->csum));
		return;
	}

	/* Hardware path: store the IPv4 pseudo-header sum and the offsets. */
	tcph->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/*
 * Compute the IPv4 TCP checksum for skb, taking the source and destination
 * addresses from the socket's inet state.
 */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *isk = inet_sk(sk);

	__tcp_v4_send_check(skb, isk->inet_saddr, isk->inet_daddr);
}
  • dev_queue_xmit

最后在dev_queue_xmit发送的时候发现设备不支持硬件checksum还会进行软件计算(是否会走这里?):

int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
///...
		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features))) ///GSO(software offload)
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		} else { ///hardware offload
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) { ///only header csum is computed
				if (skb->encapsulation)
					skb_set_inner_transport_header(skb,
						skb_checksum_start_offset(skb));
				else
					skb_set_transport_header(skb,
						skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) && ///check hardware if support offload
				     skb_checksum_help(skb)) ///HW not support CSUM
					goto out_kfree_skb;
			}
		}
}

ip_summed==CHECKSUM_PARTIAL表示协议栈并没有计算完校验和,只计算了伪头,将传输层的数据部分留给了硬件进行计算。如果底层硬件不支持CSUM,则skb_checksum_help完成计算校验和。

Remote checksum

TODO:

相关资料

]]>
Mount namespace and mount propagation 2017-03-10T15:00:30+00:00 hustcat http://hustcat.github.io/mount-namespace-and-mount-propagation Mount namespace and problems

When a new mount namespace is created, it receives a copy of the mount point list replicated from the namespace of the caller of clone() or unshare().

create_new_namespaces -> copy_mnt_ns -> dup_mnt_ns:

/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = alloc_mnt_ns();
	if (IS_ERR(new_ns))
		return new_ns;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE); ///拷贝原来namespace所有的文件系统
 ///...

每个Mount namespace有自己独立的文件系统视图,但是这种隔离性同时也带来一些问题:比如,当系统加载一块新的磁盘时,在最初的实现中,每个namespace必须单独挂载磁盘,才能见到。很多时候,我们希望挂载一次,就能在所有的mount namespace可见。为此,内核在2.6.15引入了shared subtrees feature

The key benefit of shared subtrees is to allow automatic, controlled propagation of mount and unmount events between namespaces. This means, for example, that mounting an optical disk in one mount namespace can trigger a mount of that disk in all other namespaces.

为了支持shared subtrees feature,每个挂载点都会标记propagation type,用于决定在当前挂载点下创建/删除(子)挂载点时,是否传播到别的挂载点。

propagation type

内核有以下几种传播类型:

  • MS_SHARED

This mount point shares mount and unmount events with other mount points that are members of its “peer group”. When a mount point is added or removed under this mount point, this change will propagate to the peer group, so that the mount or unmount will also take place under each of the peer mount points. Propagation also occurs in the reverse direction, so that mount and unmount events on a peer mount will also propagate to this mount point.

  • MS_PRIVATE

This is the converse of a shared mount point. The mount point does not propagate events to any peers, and does not receive propagation events from any peers.

  • MS_SLAVE

This propagation type sits midway between shared and private. A slave mount has a master—a shared peer group whose members propagate mount and unmount events to the slave mount. However, the slave mount does not propagate events to the master peer group.

  • MS_UNBINDABLE

This mount point is unbindable. Like a private mount point, this mount point does not propagate events to or from peers. In addition, this mount point can’t be the source for a bind mount operation.

几点注意事项:

(1) propagation type是对每个挂载点的设置.

(2) propagation type决定挂载点的直属(immediately under)子挂载点mount/umount事件的传播.比如,挂载点X下创建新的挂载点Y,Y会扩展到X的peer group,但是X不会影响Y下面的子挂载点。

(3)

Peer groups

peer group就是一些可以相互传播mount/umount事件的挂载点集合. 对于shared挂载点,当创建新的mount namespace或者作为bind mount的源目标时,就会创建新的成员。这两种情况都会创建新的挂载点,新的挂载点与原来的挂载点构成peer group。同理,当mount namespace释放时,或者挂载点umount时,会从对应的peer group删除。

A peer group is a set of mount points that propagate mount and unmount events to one another. A peer group acquires new members when a mount point whose propagation type is shared is either replicated during the creation of a new namespace or is used as the source for a bind mount. In both cases, the new mount point is made a member of the same peer group as the existing mount point. Conversely, a mount point ceases to be a member of a peer group when it is unmounted, either explicitly, or implicitly when a mount namespace is torn down because the last member process terminates or moves to another namespace.

  • 示例

在sh1执行:将/设置为private,并创建2个shared的挂载点:

sh1# mount --make-private / 
sh1# mount --make-shared /dev/sda3 /X 
sh1# mount --make-shared /dev/sda5 /Y 

然后在sh2执行:创建新的mount namespace:

sh2# unshare -m --propagation unchanged sh 

然后再在sh1执行:X bind mount to Z:

sh1# mkdir /Z 
sh1# mount --bind /X /Z 

这会创建2个peer group:

  • 第一个peer group包含X, X’, 和 Z。其中,X和X’是因为namespace的创建,X和Z是因为bind mount产生的。
  • 第二个peer group只包含Y, Y’。

注意,因为/是private的,所以,bind mount Z并不会传播到第二个namespace。

来看Docker使用private的代码:

// InitializeMountNamespace sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func InitializeMountNamespace(rootfs, console string, sysReadonly bool, mountConfig *MountConfig) error {
	var (
		err  error
		flag = syscall.MS_PRIVATE
	)

	if mountConfig.NoPivotRoot {
		flag = syscall.MS_SLAVE   ///容器中的mount事件不会传播到init ns
	}

	if err := syscall.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { ///将/设置为private,与init ns完全隔离
		return fmt.Errorf("mounting / with flags %X %s", (flag | syscall.MS_REC), err)
	}

	if err := syscall.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
	}
///...

Reference

]]>
TCP SYN cookies make window size suddenly becomes smaller 2017-03-03T20:00:30+00:00 hustcat http://hustcat.github.io/tcp_syn_cookies_and_window_size 问题

最近,业务反映了一个TCP连接窗口突然异常变小的问题,引起数据传输速度异常之慢,如下:

在server端回SYN-ACK包,窗口大小还是144800,在server端确认client的第一个数据包时,一下子变成了60,但数据包的长度只有86个字节。

经过和几位同事的一起各种定位,最终发现是TCP SYN cookies引起的。简单总结一下,以示后人。

TCP引入SYN cookies是为了解决SYN flood问题。

SYN cookie is a technique used to resist SYN flood attacks.The technique’s primary inventor Daniel J. Bernstein defines SYN cookies as “particular choices of initial TCP sequence numbers by TCP servers.” In particular, the use of SYN cookies allows a server to avoid dropping connections when the SYN queue fills up. Instead, the server behaves as if the SYN queue had been enlarged. The server sends back the appropriate SYN+ACK response to the client but discards the SYN queue entry. If the server then receives a subsequent ACK response from the client, the server is able to reconstruct the SYN queue entry using information encoded in the TCP sequence number.

内核参数

  • sysctl_max_syn_backlog

sysctl_max_syn_backlog控制Listen Socket的半连接(SYN_RECV)队列长度:

struct inet_connection_sock {

	struct request_sock_queue icsk_accept_queue; ////SYN_RECV sockets queue
}

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
///...
	sk->sk_state = TCP_LISTEN;
}

int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
{ 
///...
	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	
	for (lopt->max_qlen_log = 3;
	     (1 << lopt->max_qlen_log) < nr_table_entries;
	     lopt->max_qlen_log++);
///...
}

/*
 * Report whether the request queue has reached its capacity.
 *
 * Shifts listen_opt->qlen right by listen_opt->max_qlen_log, so the result
 * is 0 while qlen < (1 << max_qlen_log) and non-zero once qlen reaches
 * that power of two.
 */
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
	return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}

内核会根据listen的backlog和sysctl_max_syn_backlog计算listen socket的SYN queue的长度。如果队列满了,就会输出下面的日志:

TCP: TCP: Possible SYN flooding on port 6000. Dropping request.  Check SNMP counters.
  • sysctl_tcp_syncookies

控制是否启动TCP SYN cookies机制。

extern int sysctl_tcp_syncookies;

TCP处理新建连接的逻辑

当接收端收到发送端的SYN包之后,会创建一个request_sock,再给发送端返回SYN/ACK包后,将request_sock加入到LISTEN socket的SYN table:

tcp_v4_do_rcv(TCP_LISTEN) -> tcp_rcv_state_process -> tcp_v4_conn_request:

///ipv4_specific, LISTEN socket handle SYN packet
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct request_sock *req;
///...
	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) { ///SYN queue is full
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie) ///no tcp_syncookies, drop SKB
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}
	
	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;
///...
	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, ///send SYN/ACK
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie) ///tcp_syncookies, don't add to SYN queue
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); ///Add SYN table
		if (fastopen_cookie_present(&foc) && foc.len != 0)
			NET_INC_STATS_BH(sock_net(sk),
			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) ///fast open
		goto drop_and_free;
///...
}

当接收端再次收到发送端的ACK包时,内核会从SYN table找到与之对应的request_sock,然后调用tcp_check_req创建新的socket,至此,TCP连接算是完成建立(TCP_ESTABLISHED): tcp_v4_do_rcv(TCP_LISTEN) -> tcp_v4_hnd_req -> tcp_check_req:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr); ///get request_sock from SYN table
	if (req)
		return tcp_check_req(sk, skb, req, prev, false); /// create new socket
///...
}

SYN cookies

在没有开启tcp_syncookies选项时,如果LISTEN socket的SYN queue满之后,会直接丢掉SKB:

///ipv4_specific, LISTEN socket handle SYN packet
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
///..
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) { ///SYN queue is full
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie) ///no tcp_syncookies, drop SKB
			goto drop;
	}

开启tcp_syncookies之后,如果LISTEN socket的SYN queue满之后,会创建request_sock,再返给对端SYN/ACK后,并不会将request_sock对象加到SYN queue,而是将其释放:

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, ///send SYN/ACK
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie) ///tcp_syncookies, don't add to SYN queue
			goto drop_and_free;

这样,当收到对端的ACK后,tcp_v4_hnd_req从SYN queue找不到对应的request_sock对象,就会进入syncookies的处理逻辑: tcp_v4_do_rcv -> tcp_v4_hnd_req:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
///...
#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

cookie_v4_check会检查cookies是否有效,并创建新的request_sock对象,进入正常连接的流程。

SYN cookies与TCP options

对于走SYN cookies逻辑的连接,由于内核没有保存相关socket的状态,所以,SYN包中携带的TCP options就会丢失。

  • MSS

接收端在向发送端返回cookies时,会将MSS的值编码到cookies,发送端在返回cookies后,接收端调用cookie_v4_check获取MSS的值:

struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
			     struct ip_options *opt)
{
///...
	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
		goto out;

	if (tcp_synq_no_recent_overflow(sk) ||
	    (mss = cookie_check(skb, cookie)) == 0) { ///mss option value
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
		goto out;
	}

///...
	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
	if (!req)
		goto out;
///...
	/* Try to redo what tcp_v4_send_synack did. */
	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
	///initial window
	tcp_select_initial_window(tcp_full_space(sk), req->mss,
				  &req->rcv_wnd, &req->window_clamp,
				  ireq->wscale_ok, &rcv_wscale,
				  dst_metric(&rt->dst, RTAX_INITRWND));

	ireq->rcv_wscale  = rcv_wscale;
///...
}
  • wscale

但是,对于其它option,比如wscale、SACK等信息,就会丢失。后来,又使用timestamp来保存wscale,后来又取消了,参考1、2。详细参考Improving syncookies。

对于TCP SYN cookies的处理逻辑,接收端在收到对端的ACK之后,会重新计算wscale,而不是TCP在建立连接的SYN/SYN-ACK过程协商的wscale,由于wscale的计算受recv buffer等参数的影响,会导致第二次计算的wscale与前面协商的不一致,从而导致发送端和接收端的wscale不一致:

void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space); ///sk_rcvbuf size
///...
	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

而TCP的窗口大小,是受到wscale影响的,从而就会导致出现开头的问题。

总结

这本来是一个很简单的问题,但定位过程却走了不少弯路,从一开始就聚焦于TCP窗口机制,企图从中找问题,而忽略了内核的一些关键输出。再次说明了那个问题:* 表面上复杂的问题,背后的原因都非常简单!*

不管怎样,目前内核的TCP SYN cookies机制是有缺陷的,请慎用。

Reference

]]>
Dive deep into inotify and overlayfs 2017-01-06T11:00:30+00:00 hustcat http://hustcat.github.io/dive-into-inotify-and-overlayfs Introduction

应用层可以使用内核提供的文件系统通知API来获取文件系统中发生的变化,比如打开、关闭、创建、删除文件(夹)等。内核最开始在2.4.0中实现了dnotify,但dnotify重用了fcntl系统调用,有很多问题,比如:(1)dnotify只能监控文件夹,不能监控某个文件;(2)使用信号SIGIO来向进程传递事件,但信号是异步的,可能丢失,而且传递的信息太少,比如,无法知道到底是文件夹的哪个文件发生的事件。

后面,内核在2.6.13实现了inotify。inotify实现了几个新的系统调用,解决了dnotify的问题。

  • inotifywait

我们可以使用inotify-tools中自带的inotifywait来监控某个目录的事件。

#inotifywait -rme modify,open,create,delete,close /root/dbyin/test/
Setting up watches.  Beware: since -r was given, this may take a while!
Watches established.
/root/dbyin/test/ CREATE f1.txt
/root/dbyin/test/ OPEN f1.txt
/root/dbyin/test/ MODIFY f1.txt
/root/dbyin/test/ CLOSE_WRITE,CLOSE f1.txt
/root/dbyin/test/ DELETE f1.txt

Another terminal:

#echo hello > /root/dbyin/test/f1.txt
#rm /root/dbyin/test/f1.txt

程序示例参考这里

Inotify的实现

核心数据结构

  • fsnotify_group

fsnotify_group代表一个inotify实例,每次应用层调用inotify_init就会创建一个实例,它维护该实例的所有event信息:

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 */
struct fsnotify_group {

	const struct fsnotify_ops *ops;	/* how this group handles things, inotify_fops */
	
	struct list_head notification_list;	/* list of event_holder this group needs to send to userspace, fsnotify_event list */
	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
	unsigned int q_len;			/* events on the queue */
	unsigned int max_events;		/* maximum events allowed on the list */

	struct list_head marks_list;	/* all inode marks for this group, struct fsnotify_mark list */

	struct fasync_struct    *fsn_fa;    /* async notification */

	/* groups can define private fields here or use the void *private */
	union {
		void *private;
#ifdef CONFIG_INOTIFY_USER
		struct inotify_group_private_data {
			spinlock_t	idr_lock;
			struct idr      idr;   ///id -> inotify_inode_mark*
			struct user_struct      *user;
		} inotify_data; ///for inotify
#endif
	}
}
  • fsnotify_mark

fsnotify_mark是联系fsnotify_group和inode的桥梁,fsnotify_group->marks_list是fsnotify_mark链表,fsnotify_mark.i.inode指向被监听文件的inode。inode->i_fsnotify_marks保存监听该inode的所有inotify实例。

/*
 * inotify's per-watch record: a generic fsnotify_mark plus the watch
 * descriptor associated with it.
 */
struct inotify_inode_mark {
	struct fsnotify_mark fsn_mark;	/* embedded generic mark */
	int wd; ///watch descriptor
};


/*
 * Generic mark tying one fsnotify_group to one watched object. The object
 * is either an inode or a vfsmount; the anonymous union holds the
 * object-specific part and the FSNOTIFY_MARK_FLAG_* bits in 'flags' are
 * defined alongside it.
 */
struct fsnotify_mark {
	__u32 mask;			/* mask this mark is for */
	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
	 * in kernel that found and may be using this mark. */
	atomic_t refcnt;		/* active things looking at this mark */
	struct fsnotify_group *group;	/* group this mark is for */
	/*
	 * NOTE(review): the comment below names group->i_fsnotify_marks, but
	 * i_fsnotify_marks is shown as a field of struct inode; presumably
	 * this node links into group->marks_list - confirm.
	 */
	struct list_head g_list;	/* list of marks by group->i_fsnotify_marks */
	spinlock_t lock;		/* protect group and inode */
	union {
		struct fsnotify_inode_mark i;	/* inode-specific fields */
		struct fsnotify_vfsmount_mark m;	/* vfsmount-specific fields */
	};
	__u32 ignored_mask;		/* events types to ignore */
#define FSNOTIFY_MARK_FLAG_INODE		0x01
#define FSNOTIFY_MARK_FLAG_VFSMOUNT		0x02
#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED	0x04
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x08
#define FSNOTIFY_MARK_FLAG_ALIVE		0x10
	unsigned int flags;		/* vfsmount or inode mark? */
	struct list_head destroy_list;
	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
};

/*
 * Inode specific fields in an fsnotify_mark
 *
 * Embedded as member 'i' of the anonymous union in struct fsnotify_mark.
 */
struct fsnotify_inode_mark {
	struct inode *inode;		/* inode this mark is associated with */
	struct hlist_node i_list;	/* list of marks by inode->i_fsnotify_marks */
	struct list_head free_i_list;	/* tmp list used when freeing this mark */
};
  • inode and file
/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {

#ifdef CONFIG_FSNOTIFY
	__u32			i_fsnotify_mask; /* all events this inode cares about */
	struct hlist_head	i_fsnotify_marks; ///struct fsnotify_inode_mark list, see fsnotify_inode_mark.i_list
#endif 
}


struct file {
    ///...
	
	void			*private_data; ///fsnotify_group*
}

Overlayfs的实现

  • 数据结构

Overlayfs的几个关键数据结构:

struct dentry {
	struct dentry *d_parent;	/* parent directory,父目录dentry对象 */
	struct qstr d_name;   ///当前分量的名称
	struct inode *d_inode;		/* inode对象, create by ovl_new_inode */
	
	const struct dentry_operations *d_op; /// == super_block->s_d_op == ovl_dentry_operations
	struct super_block *d_sb;	/* The root of the dentry tree */

	void *d_fsdata;			/* fs-specific data, struct ovl_entry */
}


/*
 * private information held for every overlayfs dentry
 *
 * Reached through dentry->d_fsdata. Records where the name was found:
 * at most one dentry in the upper layer plus a stack of lower-layer paths.
 */
struct ovl_entry {
	struct dentry *__upperdentry; ///not NULL if got in upperdir
	struct ovl_dir_cache *cache;
	union {
		struct {
			u64 version;
			bool opaque;	/* set by ovl_lookup from its upperopaque result */
		};
		struct rcu_head rcu;
	};
	/* ovl_lookup uses numlower as the bound when walking lowerstack[] */
	unsigned numlower;
	/*
	 * Flexible array of lower-layer paths, filled by ovl_lookup via
	 * memcpy(); as an array it is never a NULL pointer - "found in a
	 * lower dir" presumably means numlower > 0 (confirm in
	 * ovl_alloc_entry).
	 */
	struct path lowerstack[]; ///not NULL if got in lowdir
};


struct inode {
	const struct inode_operations	*i_op; ///ovl_dir_inode_operations
	struct super_block	*i_sb;
	
	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops, ovl_dir_operations */
	
	void			*i_private; /* fs or device private pointer,  struct ovl_entry*/
};

dentry是内核的目录项对象,每个目录(文件)都有一个对应的对象。overlayfs的每个dentry指向的inode并没有实际的磁盘数据,而是由ovl_new_inode创建的一个内存inode;dentry->d_fsdata指向ovl_entry,而后者指向真正的underlay fs的dentry。

在overlayfs遍历时,dentry->d_inode并没有多大用,实际上,在ovl_lookup中,代表父目录的inode参数struct inode *dir并没有使用到。而dentry->d_fsdata指向的ovl_entry才是进行查找的关键因素,通过ovl_entry进入到underlay fs的查找。

/*
 * Look up dentry->d_name in the overlay: first in the parent's upper
 * directory, then in each of the parent's lower layers, and attach the
 * result to @dentry as an ovl_entry (d_fsdata) plus, when something was
 * found, a freshly allocated overlay inode.
 *
 * @dir:    parent directory inode object (not referenced in the body; the
 *          parent's ovl_entry is taken from dentry->d_parent->d_fsdata)
 * @dentry: dentry object for the directory entry being looked up
 * @flags:  lookup flags (not referenced in the body)
 *
 * Returns NULL on success - including the case where nothing was found, in
 * which d_add() is called with a NULL inode - or ERR_PTR(err) on failure.
 */
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags) ///called by lookup_real
{
	struct ovl_entry *oe;
	struct ovl_entry *poe = dentry->d_parent->d_fsdata; ///dentry->d_parent->d_inode == dir
	struct path *stack = NULL;
	struct dentry *upperdir, *upperdentry = NULL;
	unsigned int ctr = 0;	/* number of lower-layer hits stored in stack[] */
	struct inode *inode = NULL;
	bool upperopaque = false;	/* true: lower layers must not be consulted/merged */
	struct dentry *this, *prev = NULL;
	unsigned int i;
	int err;

	upperdir = ovl_upperdentry_dereference(poe);
	if (upperdir) { ///(1)lookup in upperdir firstly
		this = ovl_lookup_real(upperdir, &dentry->d_name);
		err = PTR_ERR(this);
		if (IS_ERR(this))
			goto out;

		if (this) {///exist in upperdir
			if (unlikely(ovl_dentry_remote(this))) {
				dput(this);
				err = -EREMOTE;
				goto out;
			}
			if (ovl_is_whiteout(this)) {
				dput(this); ///whiteout file
				this = NULL;
				upperopaque = true;
			} else if (poe->numlower && ovl_is_opaquedir(this)) {
				upperopaque = true; ///opaque dir
			}
		}
		upperdentry = prev = this;
	}
	///(2)didn't find dentry in upperdir
	/* scratch array sized for the worst case: one hit per lower layer */
	if (!upperopaque && poe->numlower) {
		err = -ENOMEM;
		stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
		if (!stack)
			goto out_put_upper;
	}
	///(3)find dentry in lowdir
	for (i = 0; !upperopaque && i < poe->numlower; i++) {
		bool opaque = false;
		struct path lowerpath = poe->lowerstack[i];

		this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
		err = PTR_ERR(this);
		if (IS_ERR(this)) {
			/*
			 * If it's positive, then treat ENAMETOOLONG as ENOENT.
			 */
			if (err == -ENAMETOOLONG && (upperdentry || ctr))
				continue;
			goto out_put;
		}
		if (!this)
			continue;
		/* a whiteout in this layer hides everything below it */
		if (ovl_is_whiteout(this)) {
			dput(this);
			break;
		}
		/*
		 * Only makes sense to check opaque dir if this is not the
		 * lowermost layer.
		 */
		if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
			opaque = true;

		/* only directory-over-directory is stacked; otherwise the earlier hit wins */
		if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
			     !S_ISDIR(this->d_inode->i_mode))) {
			/*
			 * FIXME: check for upper-opaqueness maybe better done
			 * in remove code.
			 */
			if (prev == upperdentry)
				upperopaque = true;
			dput(this);
			break;
		}
		/*
		 * If this is a non-directory then stop here.
		 */
		if (!S_ISDIR(this->d_inode->i_mode))
			opaque = true;

		stack[ctr].dentry = this;
		stack[ctr].mnt = lowerpath.mnt;
		ctr++;
		prev = this;
		if (opaque)
			break;
	}

	oe = ovl_alloc_entry(ctr); ///ovl_dentry for current finding dentry
	err = -ENOMEM;
	if (!oe)
		goto out_put;

	if (upperdentry || ctr) {///if got in upperdir, upperdentry != NULL; else if got in lowdir, ctr > 0
		struct dentry *realdentry;

		/* the upper hit takes precedence over the first lower hit */
		realdentry = upperdentry ? upperdentry : stack[0].dentry;
		///alloc overlayfs inode for current real inode
		err = -ENOMEM;
		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
				      oe);
		if (!inode)
			goto out_free_oe;
		ovl_copyattr(realdentry->d_inode, inode);
	}

	oe->opaque = upperopaque;
	oe->__upperdentry = upperdentry;
	/* the dentry references held in stack[] now live in oe->lowerstack */
	memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
	kfree(stack);
	dentry->d_fsdata = oe; ///ovl_entry
	d_add(dentry, inode);

	return NULL;

	/* error unwinding: each label releases what was acquired before the jump */
out_free_oe:
	kfree(oe);
out_put:
	for (i = 0; i < ctr; i++)
		dput(stack[i].dentry);
	kfree(stack);
out_put_upper:
	dput(upperdentry);
out:
	return ERR_PTR(err);
}
  • open and copy up

overlayfs在打开文件时,会让struct file->f_inode指向real inode;而且,如果会修改文件,且upperdir不存在该文件,则会从lowerdir进行copy up:

/*
 * Open @path into @file.
 *
 * @path: path being opened; copied into file->f_path unchanged
 * @file: file object being set up
 * @cred: credentials forwarded to do_dentry_open()
 *
 * If the dentry has DCACHE_OP_SELECT_INODE set, its d_select_inode()
 * callback chooses the inode that is actually opened instead of
 * dentry->d_inode. Returns PTR_ERR() of that callback's result when it
 * fails, otherwise whatever do_dentry_open() returns.
 */
int vfs_open(const struct path *path, struct file *file,
            const struct cred *cred)
{
	struct dentry *dentry = path->dentry; ///overlayfs dentry
	struct inode *inode = dentry->d_inode; ///overlayfs inode

	/* f_path keeps the original (overlay) path even if the inode is swapped below */
	file->f_path = *path;
	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
		inode = dentry->d_op->d_select_inode(dentry, file->f_flags); ///get real inode, ovl_dentry_operations
		if (IS_ERR(inode))
			return PTR_ERR(inode);
	}

	return do_dentry_open(file, inode, NULL, cred); ///file->f_inode = inode
}

/*
 * Choose the inode that an open of @dentry should operate on.
 *
 * @dentry:     overlayfs dentry being opened
 * @file_flags: open flags; used to decide on copy-up and checked for
 *              O_TRUNC
 *
 * Directories are returned as-is (the overlay inode). For anything else
 * the real path is resolved, a copy-up is performed first when
 * ovl_open_need_copy_up() says so, and the inode of the resulting real
 * dentry is returned. Returns ERR_PTR(err) if ovl_want_write() or the
 * copy-up fails.
 */
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;

	if (S_ISDIR(dentry->d_inode->i_mode))
		return dentry->d_inode;

	type = ovl_path_real(dentry, &realpath); ///real dentry
	if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { ///need copy up
		err = ovl_want_write(dentry);
		if (err)
			return ERR_PTR(err);

		if (file_flags & O_TRUNC)
			err = ovl_copy_up_truncate(dentry);
		else
			err = ovl_copy_up(dentry); ///copy up
		/* paired with ovl_want_write() above; called on success and failure alike */
		ovl_drop_write(dentry);
		if (err)
			return ERR_PTR(err);

		/* after copy-up, re-resolve realpath to the upper path */
		ovl_path_upper(dentry, &realpath);
	}

	/* the real dentry may itself request inode selection; delegate to it */
	if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
		return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);

	return realpath.dentry->d_inode; ///return real inode
}

Inotify and Overlayfs

inotify_add_watch使用的是overlayfs inode:

SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
		u32, mask)
{

///...
	ret = inotify_find_inode(pathname, &path, flags); ///返回overlayfs inode
	if (ret)
		goto fput_and_out;

	/* inode held in place by reference to path; group by fget on fd */
	inode = path.dentry->d_inode; ///monitored file(overlay inode)
	group = f.file->private_data; ///notify group

	/* create/update an inode mark */
	ret = inotify_update_watch(group, inode, mask);

///...
}

fsnotify_open使用的是underlayfs inode:

/*
 * fsnotify_open - file was opened
 *
 * Raises FS_OPEN (with FS_ISDIR added when the inode is a directory) twice:
 * once towards the parent via fsnotify_parent() using file->f_path, and
 * once on the inode returned by file_inode(file).
 */
static inline void fsnotify_open(struct file *file)
{
	struct path *path = &file->f_path;
	struct inode *inode = file_inode(file); ///for overlayfs , after vfs_open, f->f_inode == underlay inode
	__u32 mask = FS_OPEN;

	if (S_ISDIR(inode->i_mode))
		mask |= FS_ISDIR;

	fsnotify_parent(path, NULL, mask);
	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
}

vfs_open中,内核会将file->f_inode指向underlayfs inode:

/*
 * Open @path into @file.
 *
 * @path: path being opened; copied into file->f_path unchanged
 * @file: file object being set up
 * @cred: credentials forwarded to do_dentry_open()
 *
 * If the dentry has DCACHE_OP_SELECT_INODE set, its d_select_inode()
 * callback chooses the inode that is actually opened instead of
 * dentry->d_inode. Returns PTR_ERR() of that callback's result when it
 * fails, otherwise whatever do_dentry_open() returns.
 */
int vfs_open(const struct path *path, struct file *file,
            const struct cred *cred)
{
	struct dentry *dentry = path->dentry; ///overlayfs dentry
	struct inode *inode = dentry->d_inode; ///overlayfs inode

	/* f_path keeps the original (overlay) path even if the inode is swapped below */
	file->f_path = *path;
	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
		inode = dentry->d_op->d_select_inode(dentry, file->f_flags); ///get underlayfs inode, ovl_dentry_operations
		if (IS_ERR(inode))
			return PTR_ERR(inode);
	}

	return do_dentry_open(file, inode, NULL, cred); ///file->f_inode = inode
}

所以,watch是加在overlayfs inode上的,而open事件是针对underlayfs inode产生的,二者不是同一个inode;因此,对单个文件进行watch时,无法得到事件。

Reference

]]>
Socket activation in systemd 2017-01-05T11:00:30+00:00 hustcat http://hustcat.github.io/socket-activation-in-systemd Introduction

systemd为了加快系统的启动速度,使用socket activation的方式让所有系统服务并发启动。socket activation的思想由来已久,inetd使用它来实现按需启动网络服务。

socket activation的核心在于将创建listen socket的过程从service daemon移到systemd,即使该服务本身没有启动,其它依赖的服务也可以连接listen socket,然后systemd创建服务进程,并将listen socket转给该daemon进程,由后者处理listen socket的各种请求。这样使得所有的服务守护进程都可以同时启动。

Socket activation makes it possible to start all four services completely simultaneously, without any kind of ordering. Since the creation of the listening sockets is moved outside of the daemons themselves we can start them all at the same time, and they are able to connect to each other’s sockets right-away.

Write socket activation daemon

使用socket activation的服务,必须从systemd接收socket,而不是自己创建socket.

A service capable of socket activation must be able to receive its preinitialized sockets from systemd, instead of creating them internally.

  • NON-SOCKET-ACTIVATABLE SERVICE

非socket activation的服务一般是自己创建socket:

/* Source Code Example #1: ORIGINAL, NOT SOCKET-ACTIVATABLE SERVICE */
...
union {
        struct sockaddr sa;
        struct sockaddr_un un;
} sa;
int fd;

fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (fd < 0) {
        fprintf(stderr, "socket(): %m\n");
        exit(1);
}

memset(&sa, 0, sizeof(sa));
sa.un.sun_family = AF_UNIX;
strncpy(sa.un.sun_path, "/run/foobar.sk", sizeof(sa.un.sun_path));

if (bind(fd, &sa.sa, sizeof(sa)) < 0) {
        fprintf(stderr, "bind(): %m\n");
        exit(1);
}

if (listen(fd, SOMAXCONN) < 0) {
        fprintf(stderr, "listen(): %m\n");
        exit(1);
}
...

这种方式下,其它依赖的服务必须在该daemon进程启动后才能访问该服务。

  • SOCKET-ACTIVATABLE SERVICE
/* Source Code Example #2: UPDATED, SOCKET-ACTIVATABLE SERVICE */
...
#include "sd-daemon.h"
...
int fd;

if (sd_listen_fds(0) != 1) { ///使用systemd创建的socket
        fprintf(stderr, "No or too many file descriptors received.\n");
        exit(1);
}

fd = SD_LISTEN_FDS_START + 0;
...

这种方式下,传统的启动服务方式将不再可用。为了兼容两种方式,可以使用下面的方法:

  • SOCKET-ACTIVATABLE SERVICE WITH COMPATIBILITY
/* Source Code Example #3: UPDATED, SOCKET-ACTIVATABLE SERVICE WITH COMPATIBILITY */
...
#include "sd-daemon.h"
...
int fd, n;

n = sd_listen_fds(0);
if (n > 1) {
        fprintf(stderr, "Too many file descriptors received.\n");
        exit(1);
} else if (n == 1)
        fd = SD_LISTEN_FDS_START + 0;
else {
        union {
                struct sockaddr sa;
                struct sockaddr_un un;
        } sa;

        fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (fd < 0) {
                fprintf(stderr, "socket(): %m\n");
                exit(1);
        }

        memset(&sa, 0, sizeof(sa));
        sa.un.sun_family = AF_UNIX;
        strncpy(sa.un.sun_path, "/run/foobar.sk", sizeof(sa.un.sun_path));

        if (bind(fd, &sa.sa, sizeof(sa)) < 0) {
                fprintf(stderr, "bind(): %m\n");
                exit(1);
        }

        if (listen(fd, SOMAXCONN) < 0) {
                fprintf(stderr, "listen(): %m\n");
                exit(1);
        }
}
...

完整程序参考这里。另外,这里还有一个Go语言的示例。

  • Enable service in systemd

创建socket unit file:

# cat /etc/systemd/system/foobar.socket 
[Socket]
ListenStream=/run/foobar.sk

[Install]
WantedBy=sockets.target

创建对应的service file:

# cat /etc/systemd/system/foobar.service 
[Service]
ExecStart=/usr/local/bin/foobard

启动socket:

# systemctl enable foobar.socket
# systemctl start foobar.socket
# systemctl status foobar.socket
● foobar.socket
   Loaded: loaded (/etc/systemd/system/foobar.socket; enabled; vendor preset: disabled)
   Active: active (listening) since 四 2017-01-05 18:59:41 CST; 31s ago
   Listen: /run/foobar.sk (Stream)

1月 05 18:59:41 centos7 systemd[1]: Listening on foobar.socket.
1月 05 18:59:41 centos7 systemd[1]: Starting foobar.socket.

# lsof /run/foobar.sk 
COMMAND PID USER   FD   TYPE             DEVICE SIZE/OFF  NODE NAME
systemd   1 root   28u  unix 0xffff88002db53400      0t0 29058 /run/foobar.sk

可以看到systemd创建了对应的socket。但此时,foobard进程并没有启动。

当我们连接/run/foobar.sk时,foobard进程就会被systemd拉起:

# socat - unix-connect:/run/foobar.sk 
hello world
hello world
again
again


# ps -ef|grep foob
root      3589  3338  0 19:01 pts/1    00:00:00 socat - unix-connect:/run/foobar.sk
root      3590     1  0 19:01 ?        00:00:00 /usr/local/bin/foobard

Internal

systemd在启动服务进程前,会设置环境变量LISTEN_PID和LISTEN_FDS:

static int build_environment(
                const ExecContext *c,
                unsigned n_fds,
                usec_t watchdog_usec,
                const char *home,
                const char *username,
                const char *shell,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        unsigned n_env = 0;
        char *x;

        assert(c);
        assert(ret);

        our_env = new0(char*, 10);
        if (!our_env)
                return -ENOMEM;

        if (n_fds > 0) {
                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }
...

服务进程通过sd_listen_fds获取对应的环境变量:

_public_ int sd_listen_fds(int unset_environment) {
        const char *e;
        int n, r, fd;
        pid_t pid;

        e = getenv("LISTEN_PID");
        if (!e) {
                r = 0;
                goto finish;
        }

        r = parse_pid(e, &pid);
        if (r < 0)
                goto finish;

        /* Is this for us? */
        if (getpid() != pid) {
                r = 0;
                goto finish;
        }

        e = getenv("LISTEN_FDS");
        if (!e) {
                r = 0;
                goto finish;
        }
...

Reference

]]>
Getting started with D-BUS 2017-01-04T19:00:30+00:00 hustcat http://hustcat.github.io/getting-started-with-dbus Introduction

D-BUS是一种进程间通信(IPC)机制,一般主要用于基于AF_UNIX套接字的本地进程间通信(local IPC),当然也可以基于TCP/IP实现跨主机的通信。

Linux上已经存在很多local IPC机制,比如FIFO、UNIX套接字等,为什么还要搞一个D-BUS呢?实际上,D-BUS采用了RPC(Remote Procedure Calling)的思想,它相当于本机的RPC,RPC相对于原始的FIFO、unix socket,是更加现代的通信方式,RPC框架本身会负责消息的编解码、安全验证等。这些会大大简化应用程序的开发。

D-Bus包含下面一些内容:

(1) libdbus: a low-level library

(2) dbus-daemon: a daemon based on libdbus. Handles and controls data transfers between DBus peers

(3) two types of busses: a system and a session one. Each bus instance is managed by a dbus-daemon

(4) a security mechanism using policy files

值得一提的是systemd没有使用libdbus,而是使用自己实现的library。

D-Bus Concepts

D-BUS协议是一个端到端(peer-to-peer or client-server)的通信协议,它包含一些基本的概念

  • 总线(bus)

相当于D-BUS的通信链路,应用之间通过总线进行通信。应用在总线上寻找service。有两种总线:

A “system bus” for notifications from the system to user sessions, and to allow the system to request input from user sessions.

A “session bus” used to implement desktop environments such as GNOME and KDE.

  • 服务(service)

服务是提供IPC API的程序,每个服务都有一个reverse domain name结构的标识名称。比如org.freedesktop.NetworkManager对应系统总线上的NetworkManager,org.freedesktop.login1对应系统总线上的systemd-logind。

  • 对象(object)

相当于通信的地址,每个service的object都通过object path来标识,object path类似文件系统的路径。比如/org/freedesktop/login1是服务org.freedesktop.login1的manager对象的路径。

  • 接口(interfaces)

每个object包含一个或者多个interfaces。D-Bus interfaces define the methods and signals supported by D-Bus objects。

  • 方法(method)

D-Bus methods may accept any number of arguments and may return any number of values, including none.

  • signal

D-Bus signals provide a 1-to-many, publish-subscribe mechanism. Similar to method return values, D-Bus signals may contain an arbitrary amount of data. Unlike methods however, signals are entirely asynchronous and may be emitted by D-Bus objects at any time.

  • signature

signature用于描述参数的数据类型,比如s代表UTF-8字符串。

更多概念参考DBus Overview

From shell

有一些D-BUS相关的工具,比如busctl、gdbus、dbus-send等,通过这些工具可以进行一些D-BUS相关的操作。

列出所有连接系统总线的端点:

# busctl
NAME                                   PID PROCESS         USER             CONNECTION    UNIT                      SESSION    DESCR
:1.0                                     1 systemd         root             :1.0          init.scope                -          -    
:1.1                                   516 polkitd         polkitd          :1.1          polkit.service            -          -    
:1.10                                 1305 busctl          root             :1.10         sshd.service              -          -    
:1.2                                   719 tuned           root             :1.2          tuned.service             -          -    
com.redhat.tuned                       719 tuned           root             :1.2          tuned.service             -          -    
fi.epitest.hostap.WPASupplicant          - -               -                (activatable) -                         -         
fi.w1.wpa_supplicant1                    - -               -                (activatable) -                         -         
org.freedesktop.DBus                     - -               -                -             -                         -          -    
org.freedesktop.PolicyKit1             516 polkitd         polkitd          :1.1          polkit.service            -          -    
org.freedesktop.hostname1                - -               -                (activatable) -                         -         
org.freedesktop.import1                  - -               -                (activatable) -                         -         
org.freedesktop.locale1                  - -               -                (activatable) -                         -         
org.freedesktop.login1                   - -               -                (activatable) -                         -         
org.freedesktop.machine1                 - -               -                (activatable) -                         -         
org.freedesktop.network1                 - -               -                (activatable) -                         -         
org.freedesktop.resolve1                 - -               -                (activatable) -                         -         
org.freedesktop.systemd1                 1 systemd         root             :1.0          init.scope                -          -    
org.freedesktop.timedate1                - -               -                (activatable) -                         -         

D-Bus API of systemd

systemd提供了一些D-BUS API,通过API,我们可以进行systemd的各种操作,比如启动、停止服务等。

查看所有的API:

# gdbus introspect --system --dest org.freedesktop.systemd1 --object-path /org/freedesktop/systemd1
node /org/freedesktop/systemd1 {
  interface org.freedesktop.DBus.Peer {
    methods:
      Ping();
      GetMachineId(out s machine_uuid);
    signals:
    properties:
  };
  interface org.freedesktop.DBus.Introspectable {
    methods:
      Introspect(out s data);
    signals:
    properties:
  };
  interface org.freedesktop.DBus.Properties {
    methods:
      Get(in  s interface,
          in  s property,
          out v value);
      GetAll(in  s interface,
             out a{sv} properties);
      Set(in  s interface,
          in  s property,
          in  v value);
    signals:
      PropertiesChanged(s interface,
                        a{sv} changed_properties,
                        as invalidated_properties);
    properties:
  };
  interface org.freedesktop.systemd1.Manager {
    methods:
      GetUnit(in  s arg_0,
              out o arg_1);
      GetUnitByPID(in  u arg_0,
                   out o arg_1);
      LoadUnit(in  s arg_0,
               out o arg_1);
      StartUnit(in  s arg_0,
                in  s arg_1,
                out o arg_2);
      StartUnitReplace(in  s arg_0,
                       in  s arg_1,
                       in  s arg_2,
                       out o arg_3);
      StopUnit(in  s arg_0,
               in  s arg_1,
               out o arg_2);
      ReloadUnit(in  s arg_0,
                 in  s arg_1,
                 out o arg_2);
      RestartUnit(in  s arg_0,
                  in  s arg_1,
                  out o arg_2);
...

接口org.freedesktop.systemd1.Manager包含了systemd提供的主要操作方法。

  • StartUnit/StopUnit
# busctl --system call org.freedesktop.systemd1 /org/freedesktop/systemd1 org.freedesktop.systemd1.Manager GetUnit s crond.service 
o "/org/freedesktop/systemd1/unit/crond_2eservice"


# busctl --system call org.freedesktop.systemd1 /org/freedesktop/systemd1 org.freedesktop.systemd1.Manager StopUnit ss crond.service replace
o "/org/freedesktop/systemd1/job/904"

# systemctl status crond.service
● crond.service - Command Scheduler
   Loaded: loaded (/usr/lib/systemd/system/crond.service; enabled; vendor preset: enabled)
   Active: inactive (dead) since 四 2017-01-05 03:13:29 CST; 4s ago
  Process: 656 ExecStart=/usr/sbin/crond -n $CRONDARGS (code=exited, status=0/SUCCESS)
 Main PID: 656 (code=exited, status=0/SUCCESS)
...

# busctl --system call org.freedesktop.systemd1 /org/freedesktop/systemd1 org.freedesktop.systemd1.Manager StartUnit ss crond.service replace
o "/org/freedesktop/systemd1/job/905"

# systemctl status crond.service
● crond.service - Command Scheduler
   Loaded: loaded (/usr/lib/systemd/system/crond.service; enabled; vendor preset: enabled)
   Active: active (running) since 四 2017-01-05 03:13:53 CST; 1s ago
 Main PID: 12277 (crond)
   Memory: 624.0K
   CGroup: /system.slice/crond.service
           └─12277 /usr/sbin/crond -n
...

Reference

]]>
The Go netpoller and timeout 2016-12-30T18:00:30+00:00 hustcat http://hustcat.github.io/go-netpoller-and-timeout 数据结构
  • netFD

netFD是net包(pkg)的网络文件描述符。

//net.go
// conn holds the network file descriptor that its methods operate on.
type conn struct {
	fd *netFD // underlying network file descriptor
}

//net/fd_unix.go
// netFD is the network file descriptor: it ties the OS socket fd to the
// socket's metadata, its addresses and the poll descriptor used for waiting.
type netFD struct {
	// locking/lifetime of sysfd + serialize access to Read and Write methods
	fdmu fdMutex

	// immutable until Close
	sysfd       int // OS socket fd
	family      int
	sotype      int
	isConnected bool
	net         string
	laddr       Addr
	raddr       Addr

	// wait server
	pd pollDesc
}

//net/fd_poll_runtime.go
// pollDesc is the net-package handle to a runtime poll descriptor.
type pollDesc struct {
	runtimeCtx uintptr // points to the runtime PollDesc
}
  • PollDesc

PollDesc代表运行时的文件描述符.

///runtime/netpoll.goc
// PollDesc is the runtime-side descriptor of a polled file descriptor.
struct PollDesc
{
	PollDesc* link;	// in pollcache, protected by pollcache.Lock

	// The lock protects pollOpen, pollSetDeadline, pollUnblock and deadlineimpl operations.
	// This fully covers seq, rt and wt variables. fd is constant throughout the PollDesc lifetime.
	// pollReset, pollWait, pollWaitCanceled and runtime·netpollready (IO readiness notification)
	// proceed w/o taking the lock. So closing, rg, rd, wg and wd are manipulated
	// in a lock-free way by all operations.
	Lock;		// protects the following fields
	uintptr	fd;  // OS socket fd, corresponds to netFD.sysfd
	bool	closing;
	uintptr	seq;	// protects from stale timers and ready notifications
	G*	rg;	// READY, WAIT, G waiting for read or nil
	Timer	rt;	// read deadline timer (set if rt.fv != nil)
	int64	rd;	// read deadline
	G*	wg;	// READY, WAIT, G waiting for write or nil
	Timer	wt;	// write deadline timer
	int64	wd;	// write deadline
	void*	user;	// user settable cookie
};

Read

从网络fd读取数据,goroutine会一直Read,直到发生EAGAIN,则将当前goroutine加到wait队列:

//net/fd_unix.go
// Read reads from the socket into p.
//
// It holds the fd's read lock for the duration of the call. syscall.Read is
// retried for as long as it fails with EAGAIN and WaitRead returns nil; every
// other outcome is passed through chkReadErr. A non-nil error other than
// io.EOF is wrapped in *OpError before being returned.
func (fd *netFD) Read(p []byte) (n int, err error) {
	if err := fd.readLock(); err != nil {
		return 0, err
	}
	defer fd.readUnlock()
	if err := fd.pd.PrepareRead(); err != nil { // check for an already pending error
		return 0, &OpError{"read", fd.net, fd.raddr, err}
	}
	for { // keep reading; on EAGAIN the current goroutine is put on the wait queue
		n, err = syscall.Read(int(fd.sysfd), p)
		if err != nil {
			n = 0
			if err == syscall.EAGAIN {
				if err = fd.pd.WaitRead(); err == nil { // wait to be woken up by epoll
					continue
				}
			}
		}
		err = chkReadErr(n, err, fd)
		break
	}
	if err != nil && err != io.EOF {
		err = &OpError{"read", fd.net, fd.raddr, err}
	}
	return
}
//net/fd_poll_runtime.go
// Wait calls runtime_pollWait on the runtime poll descriptor for the given
// mode and turns its integer result into an error with convertErr.
func (pd *pollDesc) Wait(mode int) error {
	return convertErr(runtime_pollWait(pd.runtimeCtx, mode))
}
  • runtime_pollWait
//runtime/netpoll.goc
// runtime_pollWait waits until pd is ready for the given mode or an error
// state is observed. err is 0 on success, otherwise the code returned by
// checkerr (1 = closing, 2 = timeout).
func runtime_pollWait(pd *PollDesc, mode int) (err int) {
	err = checkerr(pd, mode);
	if(err == 0) {
		// As for now only Solaris uses level-triggered IO.
		if(Solaris)
			runtime·netpollarm(pd, mode);
		// netpollblock returns false when it was woken without IO being
		// ready; re-check the error state and wait again if there is none.
		while(!netpollblock(pd, mode, false)) {
			err = checkerr(pd, mode);
			if(err != 0)
				break;
			// Can happen if timeout has fired and unblocked us,
			// but before we had a chance to run, timeout has been reset.
			// Pretend it has not happened and retry.
		}
	}
}

// returns true if IO is ready, or false if timedout or closed
// waitio - wait only for completed IO, ignore errors
//
// mode selects the slot to wait on: pd->wg for 'w', pd->rg otherwise.
static bool
netpollblock(PollDesc *pd, int32 mode, bool waitio)
{
	G **gpp, *old;

	gpp = &pd->rg;
	if(mode == 'w')
		gpp = &pd->wg;

	// set the gpp semaphore to WAIT
	for(;;) {
		old = *gpp;
		if(old == READY) {
			// A notification is already pending: consume it and do not block.
			*gpp = nil;
			return true;
		}
		if(old != nil)
			runtime·throw("netpollblock: double wait");
		if(runtime·casp(gpp, nil, WAIT))
			break;
	}

	// need to recheck error states after setting gpp to WAIT
	// this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl
	// do the opposite: store to closing/rd/wd, membarrier, load of rg/wg
	if(waitio || checkerr(pd, mode) == 0)
		runtime·park((bool(*)(G*, void*))blockcommit, gpp, "IO wait"); // block the current goroutine
	// be careful to not lose concurrent READY notification
	old = runtime·xchgp(gpp, nil);
	if(old > WAIT)
		runtime·throw("netpollblock: corrupted state");
	return old == READY;
}

// Reports the error state of pd for the given mode:
// 1 if the descriptor is closing, 2 if the deadline of the requested
// direction ('r' -> rd, 'w' -> wd) is negative, 0 otherwise.
static intgo
checkerr(PollDesc *pd, int32 mode)
{
	if(pd->closing)
		return 1;  // errClosing

	switch(mode) {
	case 'r':
		if(pd->rd < 0)
			return 2;  // errTimeout
		break;
	case 'w':
		if(pd->wd < 0)
			return 2;  // errTimeout
		break;
	}
	return 0;
}

sysmon

sysmon(OS线程)会定期检查epoll事件,并将ready的G加入到全局队列(值得注意的是并没有直接调用runtime·ready):

//runtime/proc.c
// Excerpt of sysmon: only the network-polling step is shown here.
static void
sysmon(void)
{
// ... (rest of the function elided)
		// poll network if not polled for more than 10ms
		lastpoll = runtime·atomicload64(&runtime·sched.lastpoll);
		now = runtime·nanotime();
		if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) {
			runtime·cas64(&runtime·sched.lastpoll, lastpoll, now);
			gp = runtime·netpoll(false);  // non-blocking
			if(gp) {
				// Need to decrement number of idle locked M's
				// (pretending that one more is running) before injectglist.
				// Otherwise it can lead to the following situation:
				// injectglist grabs all P's but before it starts M's to run the P's,
				// another M returns from syscall, finishes running its G,
				// observes that there is no work to do and no other running M's
				// and reports deadlock.
				incidlelocked(-1);
				injectglist(gp); // put the ready G's on the global run queue
				incidlelocked(1);
			}
		}
}

  • epoll_wait
//runtime/netpoll_epoll.c
// polls for ready network connections
// returns list of goroutines that become runnable
//
// block selects the epollwait timeout: -1 (wait indefinitely) when true,
// 0 (return immediately) when false.
G*
runtime·netpoll(bool block)
{
	static int32 lasterr;
	EpollEvent events[128], *ev;
	int32 n, i, waitms, mode;
	G *gp;

	if(epfd == -1)
		return nil;
	waitms = -1;
	if(!block)
		waitms = 0;
retry:
	n = runtime·epollwait(epfd, events, nelem(events), waitms);
	if(n < 0) {
		// Report a failure only once per distinct error code, never for EINTR.
		if(n != -EINTR && n != lasterr) {
			lasterr = n;
			runtime·printf("runtime: epollwait on fd %d failed with %d\n", epfd, -n);
		}
		goto retry;
	}
	gp = nil;
	for(i = 0; i < n; i++) {
		ev = &events[i];
		if(ev->events == 0)
			continue;
		// mode ends up as 'r', 'w' or 'r'+'w'; hangup and error events
		// count for both directions.
		mode = 0;
		if(ev->events & (EPOLLIN|EPOLLRDHUP|EPOLLHUP|EPOLLERR))
			mode += 'r';
		if(ev->events & (EPOLLOUT|EPOLLHUP|EPOLLERR))
			mode += 'w';
		if(mode)
			runtime·netpollready(&gp, (void*)ev->data, mode);
	}
	// A blocking poll must not return an empty list.
	if(block && gp == nil)
		goto retry;
	return gp;
}

// Injects the list of runnable G's into the scheduler.
// Can run concurrently with GC.
//
// glist is linked through G.schedlink; a nil list is a no-op.
static void
injectglist(G *glist)
{
	int32 n;
	G *gp;

	if(glist == nil)
		return;
	runtime·lock(&runtime·sched);
	// Mark every G runnable and put it on the global run queue,
	// counting the G's in n.
	for(n = 0; glist; n++) {
		gp = glist;
		glist = gp->schedlink;
		gp->status = Grunnable;
		globrunqput(gp);
	}
	runtime·unlock(&runtime·sched);

	// Call startm at most once per injected G, and only while
	// runtime·sched.npidle is non-zero.
	for(; n && runtime·sched.npidle; n--)
		startm(nil, false);
}
///runtime/netpoll.goc
// make pd ready, newly runnable goroutines (if any) are enqueued into gpp list
//
// mode is 'r', 'w' or 'r'+'w'. Goroutines returned by netpollunblock are
// pushed onto the front of *gpp via their schedlink field.
void
runtime·netpollready(G **gpp, PollDesc *pd, int32 mode)
{
	G *rg, *wg;

	rg = wg = nil;
	if(mode == 'r' || mode == 'r'+'w')
		rg = netpollunblock(pd, 'r', true);
	if(mode == 'w' || mode == 'r'+'w')
		wg = netpollunblock(pd, 'w', true);
	if(rg) {
		rg->schedlink = *gpp;
		*gpp = rg;
	}
	if(wg) {
		wg->schedlink = *gpp;
		*gpp = wg;
	}
}

// Clears the wait slot of pd for the given mode (pd->wg for 'w', pd->rg
// otherwise) and returns the goroutine that was parked there, or nil if
// there was none. With ioready set the slot is left at READY instead of nil.
static G*
netpollunblock(PollDesc *pd, int32 mode, bool ioready)
{
	G **gpp, *old, *new;

	gpp = &pd->rg;
	if(mode == 'w')
		gpp = &pd->wg;

	for(;;) {
		old = *gpp;
		if(old == READY)
			return nil;
		if(old == nil && !ioready) {
			// Only set READY for ioready. runtime_pollWait
			// will check for timeout/cancel before waiting.
			return nil;
		}
		new = nil;
		if(ioready)
			new = READY;
		// Retry if the slot changed between the load and the CAS.
		if(runtime·casp(gpp, old, new))
			break;
	}
	if(old > WAIT)
		return old;  // must be G*
	return nil;
}

SetReadDeadline

SetReadDeadline用于实现Go的网络IO超时原语,它会给netFD创建对应的IO定时器。当定时器超时,如果对应进行Read/Write的goroutine处于等待的状态,runtime会调用runtime·ready将其唤醒(默认情况下deadline为0,不会创建定时器)。

Go并没有使用epoll_wait实现IO的超时,而是通过Set[Read|Write]Deadline(time.Time)对每个netFD设置超时。

SetDeadline设置的定时器超时后,在超时处理函数中,会删除该定时器;而且,每次收到或者发送数据时,也不会reset该定时器。所以,每次Read/Write操作之前,都需要调用该函数:

// A net.Conn that sets a deadline for every Read or Write operation
type TimeoutConn struct {
	net.Conn
	timeout time.Duration // per-operation timeout; values <= 0 set no deadline
}

// Read reads from the wrapped connection. When c.timeout is positive, the
// read deadline is first moved to now + c.timeout; if setting the deadline
// fails, that error is returned and no read is attempted.
func (c *TimeoutConn) Read(b []byte) (int, error) {
	if c.timeout <= 0 {
		return c.Conn.Read(b)
	}
	deadline := time.Now().Add(c.timeout)
	if err := c.Conn.SetReadDeadline(deadline); err != nil {
		return 0, err
	}
	return c.Conn.Read(b)
}
// SetReadDeadline implements the Conn SetReadDeadline method.
// It returns syscall.EINVAL when c.ok() reports false, otherwise the result
// of setting the read deadline on the underlying netFD.
func (c *conn) SetReadDeadline(t time.Time) error {
	if c.ok() {
		return c.fd.setReadDeadline(t)
	}
	return syscall.EINVAL
}
///net/fd_poll_runtime.go
// setReadDeadline sets the read deadline of fd by calling setDeadlineImpl
// with mode 'r'.
func (fd *netFD) setReadDeadline(t time.Time) error {
	return setDeadlineImpl(fd, t, 'r')
}

// setWriteDeadline sets the write deadline of fd by calling setDeadlineImpl
// with mode 'w'.
func (fd *netFD) setWriteDeadline(t time.Time) error {
	return setDeadlineImpl(fd, t, 'w')
}

// setDeadlineImpl converts the wall-clock deadline t into the runtime's
// nanosecond clock and hands it to runtime_pollSetDeadline for the given
// mode. A zero t is passed on as 0. The fd is referenced for the duration of
// the runtime call; a failing incref is returned unchanged.
func setDeadlineImpl(fd *netFD, t time.Time, mode int) error {
	var deadline int64
	if !t.IsZero() {
		deadline = runtimeNano() + int64(t.Sub(time.Now()))
	}

	err := fd.incref()
	if err != nil {
		return err
	}
	runtime_pollSetDeadline(fd.pd.runtimeCtx, deadline, mode)
	fd.decref()
	return nil
}
// runtime_pollSetDeadline installs deadline d for mode 'r', 'w' or 'r'+'w'.
// d is compared against runtime·nanotime(); 0 means no deadline, and a
// non-zero d that is not in the future is stored as -1. It does nothing when
// pd is closing.
func runtime_pollSetDeadline(pd *PollDesc, d int64, mode int) {
	G *rg, *wg;

	runtime·lock(pd);
	if(pd->closing) {
		runtime·unlock(pd);
		return;
	}
	pd->seq++;  // invalidate current timers
	// Reset current timers.
	if(pd->rt.fv) {
		runtime·deltimer(&pd->rt);
		pd->rt.fv = nil;
	}
	if(pd->wt.fv) {
		runtime·deltimer(&pd->wt);
		pd->wt.fv = nil;
	}
	// Setup new timers.
	if(d != 0 && d <= runtime·nanotime())
		d = -1;
	if(mode == 'r' || mode == 'r'+'w')
		pd->rd = d;
	if(mode == 'w' || mode == 'r'+'w')
		pd->wd = d;
	// Equal read and write deadlines share the single timer rt (deadlineFn);
	// otherwise each positive deadline gets its own timer.
	if(pd->rd > 0 && pd->rd == pd->wd) {
		pd->rt.fv = &deadlineFn;
		pd->rt.when = pd->rd;
		// Copy current seq into the timer arg.
		// Timer func will check the seq against current descriptor seq,
		// if they differ the descriptor was reused or timers were reset.
		pd->rt.arg.type = (Type*)pd->seq;
		pd->rt.arg.data = pd;
		runtime·addtimer(&pd->rt);
	} else {
		if(pd->rd > 0) {
			pd->rt.fv = &readDeadlineFn;
			pd->rt.when = pd->rd;
			pd->rt.arg.type = (Type*)pd->seq;
			pd->rt.arg.data = pd;
			runtime·addtimer(&pd->rt);
		}
		if(pd->wd > 0) {
			pd->wt.fv = &writeDeadlineFn;
			pd->wt.when = pd->wd;
			pd->wt.arg.type = (Type*)pd->seq;
			pd->wt.arg.data = pd;
			runtime·addtimer(&pd->wt);
		}
	}
	// If we set the new deadline in the past, unblock currently pending IO if any.
	rg = nil;
	runtime·atomicstorep(&wg, nil);  // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock
	if(pd->rd < 0)
		rg = netpollunblock(pd, 'r', false);
	if(pd->wd < 0)
		wg = netpollunblock(pd, 'w', false);
	runtime·unlock(pd);
	// Wake the unblocked goroutines only after the lock has been released.
	if(rg)
		runtime·ready(rg);
	if(wg)
		runtime·ready(wg);
}

// Timer callbacks installed by runtime_pollSetDeadline: deadlineFn for a
// shared read+write deadline, the other two for a single direction.
static FuncVal deadlineFn	= {(void(*)(void))deadline};
static FuncVal readDeadlineFn	= {(void(*)(void))readDeadline};
static FuncVal writeDeadlineFn	= {(void(*)(void))writeDeadline};


// Timer callback for a read-only deadline: runs deadlineimpl with
// read=true, write=false.
static void
readDeadline(int64 now, Eface arg)
{
	deadlineimpl(now, arg, true, false);
}

// Common body of the deadline timer callbacks.
//
// arg carries the PollDesc in arg.data and, in arg.type, the value pd->seq
// had when the timer was armed. If that value no longer matches pd->seq the
// event is stale and ignored. Otherwise the expired deadline(s) selected by
// read/write are set to -1, and any goroutine waiting in that direction is
// unblocked and made ready once the lock has been released.
// now is unused.
static void
deadlineimpl(int64 now, Eface arg, bool read, bool write)
{
	PollDesc *pd;
	// Must have the same width as PollDesc.seq (uintptr). A 32-bit local
	// would truncate the stored value on 64-bit targets, and once pd->seq
	// grew past 2^32 every timer would be treated as stale.
	uintptr seq;
	G *rg, *wg;

	USED(now);
	pd = (PollDesc*)arg.data;
	// This is the seq when the timer was set.
	// If it's stale, ignore the timer event.
	seq = (uintptr)arg.type;
	rg = wg = nil;
	runtime·lock(pd);
	if(seq != pd->seq) {
		// The descriptor was reused or timers were reset.
		runtime·unlock(pd);
		return;
	}
	if(read) {
		if(pd->rd <= 0 || pd->rt.fv == nil)
			runtime·throw("deadlineimpl: inconsistent read deadline");
		pd->rd = -1;
		runtime·atomicstorep(&pd->rt.fv, nil);  // full memory barrier between store to rd and load of rg in netpollunblock
		rg = netpollunblock(pd, 'r', false);
	}
	if(write) {
		// A combined read+write deadline uses only the rt timer, so wt.fv
		// may legitimately be nil when read is also set.
		if(pd->wd <= 0 || (pd->wt.fv == nil && !read))
			runtime·throw("deadlineimpl: inconsistent write deadline");
		pd->wd = -1;
		runtime·atomicstorep(&pd->wt.fv, nil);  // full memory barrier between store to wd and load of wg in netpollunblock
		wg = netpollunblock(pd, 'w', false);
	}
	runtime·unlock(pd);
	if(rg)
		runtime·ready(rg);
	if(wg)
		runtime·ready(wg);
}

初始化操作

  • epoll init
///runtime/netpoll_epoll.c
static int32 epfd = -1;  // epoll descriptor

// Creates the epoll descriptor and stores it in epfd.
// epollcreate1(EPOLL_CLOEXEC) is tried first; if it fails, epollcreate(1024)
// is used and the descriptor is marked close-on-exec afterwards. If both
// calls fail the error is printed and the runtime throws.
void
runtime·netpollinit(void)
{
	epfd = runtime·epollcreate1(EPOLL_CLOEXEC);
	if(epfd < 0) {
		epfd = runtime·epollcreate(1024);
		if(epfd >= 0) {
			runtime·closeonexec(epfd);
		} else {
			runtime·printf("netpollinit: failed to create descriptor (%d)\n", -epfd);
			runtime·throw("netpollinit: failed to create descriptor");
		}
	}
}
  • netFD init

runtime创建socket fd后,会将其加入到epoll:

//net/fd_unix.go
// init initialises the poll descriptor of fd and returns its error, if any.
func (fd *netFD) init() error {
	return fd.pd.Init(fd)
}

//net/fd_poll_runtime.go
// Init runs the one-time poll server initialisation, opens a runtime poll
// descriptor for fd.sysfd (which adds the fd to epoll) and stores the
// resulting handle in pd.runtimeCtx. A non-zero errno from the runtime is
// returned as syscall.Errno and leaves pd.runtimeCtx untouched.
func (pd *pollDesc) Init(fd *netFD) error {
	serverInit.Do(runtime_pollServerInit)
	ctx, errno := runtime_pollOpen(uintptr(fd.sysfd))
	if errno == 0 {
		pd.runtimeCtx = ctx
		return nil
	}
	return syscall.Errno(errno)
}

//runtime/netpoll.goc
// runtime_pollOpen allocates a PollDesc for fd, resets its state under the
// descriptor lock and registers it through runtime·netpollopen, whose result
// is returned as errno.
func runtime_pollOpen(fd uintptr) (pd *PollDesc, errno int) {
	pd = allocPollDesc();
	runtime·lock(pd);
	// The descriptor must not still have a goroutine parked on it.
	if(pd->wg != nil && pd->wg != READY)
		runtime·throw("runtime_pollOpen: blocked write on free descriptor");
	if(pd->rg != nil && pd->rg != READY)
		runtime·throw("runtime_pollOpen: blocked read on free descriptor");
	pd->fd = fd;
	pd->closing = false;
	pd->seq++;
	pd->rg = nil;
	pd->rd = 0;
	pd->wg = nil;
	pd->wd = 0;
	runtime·unlock(pd);

	errno = runtime·netpollopen(fd, pd);
}

// Adds fd to the epoll instance epfd, registered for input, output and
// peer-hangup events in edge-triggered mode (EPOLLET), with pd stored as the
// event's user data. Returns the negated result of epollctl.
int32
runtime·netpollopen(uintptr fd, PollDesc *pd)
{
	EpollEvent event;
	int32 rc;

	event.data = (uint64)pd;
	event.events = EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET;
	rc = runtime·epollctl(epfd, EPOLL_CTL_ADD, (int32)fd, &event);
	return -rc;
}

总结

SetReadDeadline用于实现Go的网络IO超时原语,它会给netFD创建对应的IO定时器。当定时器超时,如果对应进行Read/Write的goroutine处于等待的状态,runtime会调用runtime·ready将其唤醒(默认情况下deadline为0,不会创建定时器)。

Go并没有使用epoll_wait实现IO的超时,而是通过Set[Read|Write]Deadline(time.Time)对每个netFD设置超时。

SetDeadline设置的定时器超时后,在超时处理函数中,会删除该定时器;而且,每次收到或者发送数据时,也不会reset该定时器。所以,每次Read/Write操作之前,都需要调用该函数。

Reference

]]>