YY哥 2017-11-09T08:26:43+00:00 hustcat@gmail.com Understanding the RoCE network protocol 2017-11-09T15:20:30+00:00 hustcat http://hustcat.github.io/roce-protocol RoCE是RDMA over Converged Ethernet的简称,基于它可以在以太网上实现RDMA.另外一种方式是RDMA over InfiniBand.所以RoCE(严格来说是RoCEv1)是一个与InfiniBand相对应的链路层协议。

There are two RoCE versions, RoCE v1 and RoCE v2. RoCE v1 is an Ethernet link layer protocol and hence allows communication between any two hosts in the same Ethernet broadcast domain. RoCE v2 is an internet layer protocol which means that RoCE v2 packets can be routed.


对于RoCE互联网络,硬件方面需要支持IEEE DCB的L2以太网交换机,计算节点需要支持RoCE的网卡:

On the hardware side, basically you need an L2 Ethernet switch with IEEE DCB (Data Center Bridging, aka Converged Enhanced Ethernet) with support for priority flow control.

 On the compute or storage server end, you need an RoCE-capable network adapter.


对应的协议规范参考InfiniBand™ Architecture Specification Release 1.2.1 Annex A16: RoCE



由于RoCEv1的数据帧不带IP头部,所以只能在L2子网内通信。所以RoCEv2扩展了RoCEv1,将GRH(Global Routing Header)换成UDP header + IP header:

RoCEv2 is a straightforward extension of the RoCE protocol that involves a simple modification of the RoCE packet format.

Instead of the GRH, RoCEv2 packets carry an IP header which allows traversal of IP L3 Routers and a UDP header that serves as a stateless encapsulation layer for the RDMA Transport Protocol Packets over IP.





Linux Soft-RoCE implementation 2017-11-08T23:20:30+00:00 hustcat http://hustcat.github.io/linux-soft-roce-implementation 内核在4.9引入的Soft-RoCE实现了RoCEv2.


libRXE (user space library)

|--- rxe_create_qp
    |--- ibv_cmd_create_qp
  • ibv_create_qp
LATEST_SYMVER_FUNC(ibv_create_qp, 1_1, "IBVERBS_1.1",
		   struct ibv_qp *,
		   struct ibv_pd *pd,
		   struct ibv_qp_init_attr *qp_init_attr)
	struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr); ///rxe_ctx_ops
  • rxe_create_qp
static struct ibv_qp *rxe_create_qp(struct ibv_pd *pd,
				    struct ibv_qp_init_attr *attr)
	struct ibv_create_qp cmd;
	struct rxe_create_qp_resp resp;
	struct rxe_qp *qp;
	int ret;
	ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd,
				&resp.ibv_resp, sizeof resp); /// ibv_create_qp CMD, to kernel
	qp->sq.max_sge = attr->cap.max_send_sge;
	qp->sq.max_inline = attr->cap.max_inline_data;
	qp->sq.queue = mmap(NULL, resp.sq_mi.size, PROT_READ | PROT_WRITE,
			    pd->context->cmd_fd, resp.sq_mi.offset); ///mmap,参考rxe_mmap




static const struct file_operations uverbs_fops = {
	.owner	 = THIS_MODULE,
	.write	 = ib_uverbs_write,
	.open	 = ib_uverbs_open,
	.release = ib_uverbs_close,
	.llseek	 = no_llseek,
  • ibv_open_device
LATEST_SYMVER_FUNC(ibv_open_device, 1_1, "IBVERBS_1.1",
		   struct ibv_context *,
		   struct ibv_device *device)
	struct verbs_device *verbs_device = verbs_get_device(device);
	char *devpath;
	int cmd_fd, ret;
	struct ibv_context *context;
	struct verbs_context *context_ex;

	if (asprintf(&devpath, "/dev/infiniband/%s", device->dev_name) < 0)
		return NULL;

	 * We'll only be doing writes, but we need O_RDWR in case the
	 * provider needs to mmap() the file.
	cmd_fd = open(devpath, O_RDWR | O_CLOEXEC); /// /dev/infiniband/uverbs0

	if (cmd_fd < 0)
		return NULL;

	if (!verbs_device->ops->init_context) {
		context = verbs_device->ops->alloc_context(device, cmd_fd); ///rxe_alloc_context, rxe_dev_ops
		if (!context)
			goto err;
	context->device = device;
	context->cmd_fd = cmd_fd;
	pthread_mutex_init(&context->mutex, NULL);


	return context;

kernel (rdma_rxe module)

  • ib_uverbs_create_qp


|--- ib_uverbs_create_qp
     |--- create_qp
	      |--- ib_device->create_qp
		       |--- rxe_create_qp

create_qp调用ib_device->create_qp,对于RXE, 为函数rxe_create_qp, 参考rxe_register_device.

  • rxe_create_qp
|--- rxe_qp_from_init
     |--- rxe_qp_init_req


  • rxe_qp_init_req


创建对应的UDP socket



  • rxe_queue_init


struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
				 int *num_elem,
				 unsigned int elem_size)
	struct rxe_queue *q;
	size_t buf_size;
	unsigned int num_slots;
	buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size;

	q->buf = vmalloc_user(buf_size);

rxe_queue->buf指向的内存缓冲区,由rxe_mmap映射到用户空间,队列的element对应数据结构struct rxe_send_wqe.

libibverbs API调用ibv_post_send时,会将对应的struct rxe_send_wqe加入到该队列,参考rdma-core@post_one_send.

  • rxe_mmap
 * rxe_mmap - create a new mmap region
 * @context: the IB user context of the process making the mmap() call
 * @vma: the VMA to be initialized
 * Return zero if the mmap is OK. Otherwise, return an errno.
int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
	struct rxe_dev *rxe = to_rdev(context->device);
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct rxe_mmap_info *ip, *pp;

	ret = remap_vmalloc_range(vma, ip->obj, 0);
	if (ret) {
		pr_err("rxe: err %d from remap_vmalloc_range\n", ret);
		goto done;

	vma->vm_ops = &rxe_vm_ops;
	vma->vm_private_data = ip;



rxe_post_send会将struct ibv_send_wr转成struct rxe_send_wqe,并加入到发送队列rxe_qp->sq,然后通过cmd_fd给RXE内核模块发送IB_USER_VERBS_CMD_POST_SEND命令:

/* this API does not make a distinction between
   restartable and non-restartable errors */
static int rxe_post_send(struct ibv_qp *ibqp,
			 struct ibv_send_wr *wr_list,
			 struct ibv_send_wr **bad_wr)
	int rc = 0;
	int err;
	struct rxe_qp *qp = to_rqp(ibqp);/// ibv_qp -> rxe_qp
	struct rxe_wq *sq = &qp->sq;

	if (!bad_wr)
		return EINVAL;

	*bad_wr = NULL;

	if (!sq || !wr_list || !sq->queue)
	 	return EINVAL;


	while (wr_list) {
		rc = post_one_send(qp, sq, wr_list); /// ibv_send_wr -> rxe_send_wqe, enqueue
		if (rc) {
			*bad_wr = wr_list;

		wr_list = wr_list->next;


	err =  post_send_db(ibqp); /// IB_USER_VERBS_CMD_POST_SEND cmd
	return err ? err : rc;



ib_uverbs_post_send -> ib_device->post_send -> rxe_post_send -> rxe_requester -> ip_local_out

  • rxe_post_send
static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
			 struct ib_send_wr **bad_wr)
	int err = 0;
	struct rxe_qp *qp = to_rqp(ibqp); ///ib_qp -> rxe_qp
	 * Must sched in case of GSI QP because ib_send_mad() hold irq lock,
	 * and the requester call ip_local_out_sk() that takes spin_lock_bh.
	must_sched = (qp_type(qp) == IB_QPT_GSI) ||
			(queue_count(qp->sq.queue) > 1);

	rxe_run_task(&qp->req.task, must_sched); /// to rxe_requester

	return err;
  • rxe_requester


int rxe_requester(void *arg)
	struct rxe_qp *qp = (struct rxe_qp *)arg;
	struct rxe_pkt_info pkt;
	struct sk_buff *skb;
	struct rxe_send_wqe *wqe;
	wqe = req_next_wqe(qp); /// get rxe_send_wqe
	/// rxe_send_wqe -> skb
	skb = init_req_packet(qp, wqe, opcode, payload, &pkt);
	ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);

static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
				  struct rxe_pkt_info *pkt, struct sk_buff *skb)
	if (pkt->mask & RXE_LOOPBACK_MASK) {
		memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
		err = rxe->ifc_ops->loopback(skb);
	} else {
		err = rxe->ifc_ops->send(rxe, pkt, skb);/// ifc_ops->send, send



RDMA Programming - Base on linux-rdma 2017-11-08T23:00:30+00:00 hustcat http://hustcat.github.io/rdma-programming linux-rdma为Linux内核Infiniband子系统drivers/infiniband对应的用户态库,提供了Infiniband Verbs API和RDMA Verbs API.


  • Queue Pair(QP)

为了进行RDMA操作,需要在两端建立连接,这通过Queue Pair (QP)来完成,QP相当于socket。通信的两端都需要进行QP的初始化,Communication Manager (CM) 在双方真正建立连接前交换QP信息。

Once a QP is established, the verbs API can be used to perform RDMA reads, RDMA writes, and atomic operations. Serialized send/receive operations, which are similar to socket reads/writes, can be performed as well.

QP对应数据结构struct ibv_qp, ibv_create_qp用于创建QP.

 * ibv_create_qp - Create a queue pair.
struct ibv_qp *ibv_create_qp(struct ibv_pd *pd,
			     struct ibv_qp_init_attr *qp_init_attr);
  • Completion Queue(CQ)

A Completion Queue is an object which contains the completed work requests which were posted to the Work Queues (WQ). Every completion says that a specific WR was completed (both successfully completed WRs and unsuccessfully completed WRs). A Completion Queue is a mechanism to notify the application about information of ended Work Requests (status, opcode, size, source).

对应数据结构struct ibv_cq. ibv_create_cq用于创建CQ:

 * ibv_create_cq - Create a completion queue
 * @context - Context CQ will be attached to
 * @cqe - Minimum number of entries required for CQ
 * @cq_context - Consumer-supplied context returned for completion events
 * @channel - Completion channel where completion events will be queued.
 *     May be NULL if completion events will not be used.
 * @comp_vector - Completion vector used to signal completion events.
 *     Must be >= 0 and < context->num_comp_vectors.
struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe,
			     void *cq_context,
			     struct ibv_comp_channel *channel,
			     int comp_vector);
  • Memory Registration (MR)

Memory Registration is a mechanism that allows an application to describe a set of virtually contiguous memory locations or a set of physically contiguous memory locations to the network adapter as a virtually contiguous buffer using Virtual Addresses.

对应数据结构struct ibv_mr:

struct ibv_mr {
	struct ibv_context     *context;
	struct ibv_pd	       *pd;
	void		       *addr;
	size_t			length;
	uint32_t		handle;
	uint32_t		lkey;
	uint32_t		rkey;

Every MR has a remote and a local key (rkey, lkey).

Local keys are used by the local HCA to access local memory, such as during a receive operation.

Remote keys are given to the remote HCA to allow a remote process access to system memory during RDMA operations.

ibv_reg_mr registers a memory region (MR), associates it with a protection domain (PD), and assigns it local and remote keys (lkey, rkey).

 * ibv_reg_mr - Register a memory region
struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr,
			  size_t length, int access);
  • Protection Domain (PD)

Object whose components can interact with only each other. These components can be AH, QP, MR, and SRQ.

A protection domain is used to associate Queue Pairs with Memory Regions and Memory Windows , as a means for enabling and controlling network adapter access to Host System memory.

struct ibv_pd is used to implement protection domains:

struct ibv_pd {
	struct ibv_context     *context;
	uint32_t		handle;

ibv_alloc_pd creates a protection domain (PD). PDs limit which memory regions can be accessed by which queue pairs (QP) providing a degree of protection from unauthorized access.

 * ibv_alloc_pd - Allocate a protection domain
struct ibv_pd *ibv_alloc_pd(struct ibv_context *context);
  • Send Request (SR)

An SR defines how much data will be sent, from where, how and, with RDMA, to where. struct ibv_send_wr is used to implement SRs.参考struct ibv_send_wr

示例(IB Verbs API example)

RDMA应用可以使用librdmacm或者libibverbs API编程。前者是对后者的进一步封装。

rc_pingpong是直接使用libibverbs API编程的示例。

一般来说,使用IB Verbs API的基本流程如下:

  • (1) Get the device list

First you must retrieve the list of available IB devices on the local host. Every device in this list contains both a name and a GUID. For example the device names can be: mthca0, mlx4_1.参考这里.

IB devices对应数据结构struct ibv_device:

struct ibv_device {
	struct _ibv_device_ops	_ops;
	enum ibv_node_type	node_type;
	enum ibv_transport_type	transport_type;
	/* Name of underlying kernel IB device, eg "mthca0" */
	char			name[IBV_SYSFS_NAME_MAX];
	/* Name of uverbs device, eg "uverbs0" */
	char			dev_name[IBV_SYSFS_NAME_MAX];
	/* Path to infiniband_verbs class device in sysfs */
	char			dev_path[IBV_SYSFS_PATH_MAX];
	/* Path to infiniband class device in sysfs */
	char			ibdev_path[IBV_SYSFS_PATH_MAX];

应用程序通过API ibv_get_device_list获取IB设备列表:

 * ibv_get_device_list - Get list of IB devices currently available
 * @num_devices: optional.  if non-NULL, set to the number of devices
 * returned in the array.
 * Return a NULL-terminated array of IB devices.  The array can be
 * released with ibv_free_device_list().
struct ibv_device **ibv_get_device_list(int *num_devices);
  • (2) Open the requested device

Iterate over the device list, choose a device according to its GUID or name and open it.参考这里.


 * ibv_open_device - Initialize device for use
struct ibv_context *ibv_open_device(struct ibv_device *device);


struct ibv_context {
	struct ibv_device      *device;
	struct ibv_context_ops	ops;
	int			cmd_fd;
	int			async_fd;
	int			num_comp_vectors;
	pthread_mutex_t		mutex;
	void		       *abi_compat;
  • (3) Allocate a Protection Domain


A Protection Domain (PD) allows the user to restrict which components can interact with only each other.

These components can be AH, QP, MR, MW, and SRQ.

  • (4) Register a memory region


Any memory buffer which is valid in the process’s virtual space can be registered.

During the registration process the user sets memory permissions and receives local and remote keys (lkey/rkey) which will later be used to refer to this memory buffer.

  • (5) Create a Completion Queue(CQ)


A CQ contains completed work requests (WR). Each WR will generate a completion queue entry (CQE) that is placed on the CQ.

The CQE will specify if the WR was completed successfully or not.

  • (6) Create a Queue Pair(QP)


Creating a QP will also create an associated send queue and receive queue.

  • (7) Bring up a QP


A created QP still cannot be used until it is transitioned through several states, eventually getting to Ready To Send (RTS).

This provides needed information used by the QP to be able send / receive data.


 * ibv_modify_qp - Modify a queue pair.
int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		  int attr_mask);



RESET               Newly created, queues empty.
INIT                Basic information set. Ready for posting to receive queue.
RTR Ready to Receive. Remote address info set for connected QPs, QP may now receive packets.
RTS Ready to Send. Timeout and retry parameters set, QP may now send packets.
  • (8) Post work requests and poll for completion

Use the created QP for communication operations.


  • (9) Cleanup
Destroy objects in the reverse order you created them:
Delete QP
Delete CQ
Deregister MR
Deallocate PD
Close device


  • server
# ibv_rc_pingpong -d rxe0 -g 0 -s 128 -r 1 -n 1
  local address:  LID 0x0000, QPN 0x000011, PSN 0x626753, GID fe80::5054:61ff:fe57:1211
  remote address: LID 0x0000, QPN 0x000011, PSN 0x849753, GID fe80::5054:61ff:fe56:1211
256 bytes in 0.00 seconds = 11.38 Mbit/sec
1 iters in 0.00 seconds = 180.00 usec/iter
  • client
# ibv_rc_pingpong -d rxe0 -g 0 -s 128 -r 1 -n 1
  local address:  LID 0x0000, QPN 0x000011, PSN 0x849753, GID fe80::5054:61ff:fe56:1211
  remote address: LID 0x0000, QPN 0x000011, PSN 0x626753, GID fe80::5054:61ff:fe57:1211
256 bytes in 0.00 seconds = 16.13 Mbit/sec
1 iters in 0.00 seconds = 127.00 usec/iter


其中,第一个RC Send only为client发送给server的包,参考这里. 然后server回了一个RC Ack,并给client发送了一个RC Send only,参考这里.



Multiple queue and RSS in DPDK 2017-10-17T11:00:30+00:00 hustcat http://hustcat.github.io/rss-in-dpdk RX queue


struct rte_eth_dev_data {
	char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */

	void **rx_queues; /**< Array of pointers to RX queues. */
	void **tx_queues; /**< Array of pointers to TX queues. */
	uint16_t nb_rx_queues; /**< Number of RX queues. */
	uint16_t nb_tx_queues; /**< Number of TX queues. */


 * Structure associated with each RX queue.
struct igb_rx_queue {
	struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
	volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
	uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
	volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
	volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
	struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
	struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
	struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
	uint16_t            nb_rx_desc; /**< number of RX descriptors. */
	uint16_t            rx_tail;    /**< current value of RDT register. */
	uint16_t            nb_rx_hold; /**< number of held free RX desc. */
	uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
	uint16_t            queue_id;   /**< RX queue index. */
	uint16_t            reg_idx;    /**< RX queue register index. */
	uint8_t             port_id;    /**< Device port identifier. */
	uint8_t             pthresh;    /**< Prefetch threshold register. */
	uint8_t             hthresh;    /**< Host threshold register. */
	uint8_t             wthresh;    /**< Write-back threshold register. */
	uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
	uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */


  • e1000_adv_rx_desc


/* Receive Descriptor - Advanced */
union e1000_adv_rx_desc {
	struct {
		__le64 pkt_addr; /* Packet buffer address */
		__le64 hdr_addr; /* Header buffer address */
	} read; ///for receive
	struct {
		struct {
			union {
				__le32 data;
				struct {
					__le16 pkt_info; /*RSS type, Pkt type*/
					/* Split Header, header buffer len */
					__le16 hdr_info;
				} hs_rss;
			} lo_dword;
			union {
				__le32 rss; /* RSS Hash */
				struct {
					__le16 ip_id; /* IP id */
					__le16 csum; /* Packet Checksum */
				} csum_ip;
			} hi_dword;
		} lower;
		struct {
			__le32 status_error; /* ext status/error */
			__le16 length; /* Packet length */
			__le16 vlan; /* VLAN tag */
		} upper;
	} wb;  /* writeback */
  • igb_rx_entry


 * Structure associated with each descriptor of the RX ring of a RX queue.
struct igb_rx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */

 * The generic rte_mbuf, containing a packet mbuf.
struct rte_mbuf {
	MARKER cacheline0;

	void *buf_addr;           /**< Virtual address of segment buffer. */
	 * Physical address of segment buffer.
	 * Force alignment to 8-bytes, so as to ensure we have the exact
	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
	 * working on vector drivers easier.
	phys_addr_t buf_physaddr __rte_aligned(sizeof(phys_addr_t));

Config queue


		ret = rte_eth_dev_configure(portid, nb_rx_queue,
					(uint16_t)n_tx_queue, &port_conf);


  • config rx queue
static int
rte_eth_dev_rx_queue_config(struct rte_eth_dev *dev, uint16_t nb_queues)
	uint16_t old_nb_queues = dev->data->nb_rx_queues;
	void **rxq;
	unsigned i;

	if (dev->data->rx_queues == NULL && nb_queues != 0) { /* first time configuration */
		dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
				sizeof(dev->data->rx_queues[0]) * nb_queues,
		if (dev->data->rx_queues == NULL) {
			dev->data->nb_rx_queues = 0;
			return -(ENOMEM);

Setup queue

  • rte_eth_rx_queue_setup

DPDK application都会调用rte_eth_rx_queue_setup初始化接收队列。

rte_eth_rx_queue_setup(uint8_t port_id, uint16_t rx_queue_id,
		       uint16_t nb_rx_desc, unsigned int socket_id,
		       const struct rte_eth_rxconf *rx_conf,
		       struct rte_mempool *mp)
	ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
					      socket_id, rx_conf, mp); ///eth_igb_ops, eth_igb_rx_queue_setup

eth_igb_rx_queue_setup会创建接收队列igb_rx_queue,分配RX ring hardware descriptors(e1000_adv_rx_desc)和software ring(igb_rx_entry):

eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
			 uint16_t queue_idx,
			 uint16_t nb_desc,
			 unsigned int socket_id,
			 const struct rte_eth_rxconf *rx_conf,
			 struct rte_mempool *mp)
	const struct rte_memzone *rz;
	struct igb_rx_queue *rxq;
	struct e1000_hw     *hw;
	unsigned int size;

	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
	/* First allocate the RX queue data structure. */
	rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
	 *  Allocate RX ring hardware descriptors. A memzone large enough to
	 *  handle the maximum ring size is allocated in order to allow for
	 *  resizing in later calls to the queue setup function.
	size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
	rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
				      E1000_ALIGN, socket_id);
	rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
	rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
	rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
	rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;

	/* Allocate software ring. */
	rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
				   sizeof(struct igb_rx_entry) * nb_desc,



  • Configure RSS with DPDK

通过rx_mode.mq_mode = ETH_MQ_RX_RSS(rte_eth_dev_configure)可以开启Port的RSS,以l3fwd为例:

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_RSS,
		.max_rx_pkt_len = ETHER_MAX_LEN,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 1, /**< IP checksum offload enabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = NULL,
			.rss_hf = ETH_RSS_IP,
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
  • Driver(igb) config RSS

eth_igb_start -> eth_igb_rx_init -> igb_dev_mq_rx_configure

static int
igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
	struct e1000_hw *hw =
	uint32_t mrqc;

	if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
		 * SRIOV active scheme
		 * FIXME if support RSS together with VMDq & SRIOV
		mrqc = E1000_MRQC_ENABLE_VMDQ;
		/* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
		mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
		E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
	} else if(RTE_ETH_DEV_SRIOV(dev).active == 0) { ///disable SRIOV
		 * SRIOV inactive scheme
		switch (dev->data->dev_conf.rxmode.mq_mode) {
			case ETH_MQ_RX_RSS:
				igb_rss_configure(dev); ///RSS

static void
igb_rss_configure(struct rte_eth_dev *dev)
	if (rss_conf.rss_key == NULL)
		rss_conf.rss_key = rss_intel_key; /* Default hash key */
	igb_hw_rss_hash_set(hw, &rss_conf);


KNI in DPDK 2017-10-11T23:00:30+00:00 hustcat http://hustcat.github.io/kni-in-dpdk 介绍

The Kernel NIC Interface (KNI) is a DPDK control plane solution that allows userspace applications to exchange packets with the kernel networking stack. To accomplish this, DPDK userspace applications use an IOCTL call to request the creation of a KNI virtual device in the Linux* kernel. The IOCTL call provides interface information and the DPDK’s physical address space, which is re-mapped into the kernel address space by the KNI kernel loadable module that saves the information to a virtual device context. The DPDK creates FIFO queues for packet ingress and egress to the kernel module for each device allocated.

The KNI kernel loadable module is a standard net driver, which upon receiving the IOCTL call access the DPDK’s FIFO queue to receive/transmit packets from/to the DPDK userspace application. The FIFO queues contain pointers to data packets in the DPDK. This:

  • Provides a faster mechanism to interface with the kernel net stack and eliminates system calls

  • Facilitates the DPDK using standard Linux* userspace net tools (tcpdump, ftp, and so on)

  • Eliminate the copy_to_user and copy_from_user operations on packets.


Load KNI kernel module:

# insmod /root/dpdk/x86/lib/modules/3.10.0-514.el7.x86_64/extra/dpdk/rte_kni.ko

Build KNI application:

# export RTE_SDK=/root/dpdk/x86/share/dpdk
# cd examples/kni
# make
  CC main.o
  LD kni
  INSTALL-MAP kni.map

Run KNI application:

# build/kni -c 0x0f -n 2 -- -P -p 0x3 --config="(0,0,1),(1,2,3)" 
EAL: Detected 4 lcore(s)
EAL: No free hugepages reported in hugepages-1048576kB
EAL: Probing VFIO support...
EAL: WARNING: cpu flags constant_tsc=yes nonstop_tsc=no -> using unreliable clock cycles !
EAL: PCI device 0000:00:05.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:06.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:07.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
APP: Initialising port 0 ...
KNI: pci: 00:06:00       8086:100e
APP: Initialising port 1 ...
KNI: pci: 00:07:00       8086:100e

Checking link status
Port 0 Link Up - speed 1000 Mbps - full-duplex
Port 1 Link Up - speed 1000 Mbps - full-duplex
APP: Lcore 1 is writing to port 0
APP: Lcore 2 is reading from port 1
APP: Lcore 0 is reading from port 0
APP: Lcore 3 is writing to port 1


  • -c = core bitmask

  • -P = promiscuous mode

  • -p = port hex bitmask

  • –config=”(port, lcore_rx, lcore_tx [,lcore_kthread, …]) …”

Note that each core can do either TX or RX for one port only.

[root@vm01 ~]# ip a
7: vEth0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
    link/ether ba:92:66:e5:2f:35 brd ff:ff:ff:ff:ff:ff
8: vEth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
    link/ether b2:64:67:2f:32:4a brd ff:ff:ff:ff:ff:ff
[root@vm01 ~]# ip addr add dev vEth0
[root@vm01 ~]# ip link set vEth0 up
[root@vm03 ~]# ping -c 3 
PING ( 56(84) bytes of data.
64 bytes from icmp_seq=1 ttl=64 time=14.2 ms
64 bytes from icmp_seq=2 ttl=64 time=2.96 ms
64 bytes from icmp_seq=3 ttl=64 time=1.89 ms


[root@vm01 ~]# pkill -10 kni

**KNI example application statistics**
======  ==============  ============  ============  ============  ============
 Port    Lcore(RX/TX)    rx_packets    rx_dropped    tx_packets    tx_dropped
------  --------------  ------------  ------------  ------------  ------------
      0          0/ 1            23             0             5             0
      1          2/ 3             1             0             0             0
======  ==============  ============  ============  ============  ============



KNI示例程序位于examples/kni,KNI内核模块位于lib/librte_eal/linuxapp/kni,KNI library位于lib/librte_kni




		/* Burst rx from eth */
		nb_rx = rte_eth_rx_burst(port_id, 0, pkts_burst, PKT_BURST_SZ);

		/* Burst tx to kni */
		num = rte_kni_tx_burst(p->kni[i], pkts_burst, nb_rx);


rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
	void *phy_mbufs[num];
	unsigned int ret;
	unsigned int i;

	for (i = 0; i < num; i++)
		phy_mbufs[i] = va2pa(mbufs[i]);

	ret = kni_fifo_put(kni->rx_q, phy_mbufs, num);

	/* Get mbufs from free_q and then free them */

	return ret;

 * Adds num elements into the fifo. Return the number actually written
static inline unsigned
kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
	unsigned i = 0;
	unsigned fifo_write = fifo->write;
	unsigned fifo_read = fifo->read;
	unsigned new_write = fifo_write;

	for (i = 0; i < num; i++) {
		new_write = (new_write + 1) & (fifo->len - 1);

		if (new_write == fifo_read)
		fifo->buffer[fifo_write] = data[i];
		fifo_write = new_write;
	fifo->write = fifo_write;
	return i;
  • fifo


 * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO
 * Write and read should wrap around. Fifo is empty when write == read
 * Writing should never overwrite the read position
struct rte_kni_fifo {
	volatile unsigned write;     /**< Next position to be written*/
	volatile unsigned read;      /**< Next position to be read */
	unsigned len;                /**< Circular buffer length */
	unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
	void *volatile buffer[];     /**< The buffer contains mbuf pointers */


struct rte_kni *
rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
	      const struct rte_kni_conf *conf,
	      struct rte_kni_ops *ops)
	/* TX RING */
	mz = slot->m_tx_q;
	ctx->tx_q = mz->addr;
	kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
	dev_info.tx_phys = mz->phys_addr;

	/* RX RING */
	mz = slot->m_rx_q;
	ctx->rx_q = mz->addr;
	kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
	dev_info.rx_phys = mz->phys_addr;
	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); ///内核模块

KNI kernel module

static int
kni_ioctl(struct inode *inode, uint32_t ioctl_num, unsigned long ioctl_param)
		ret = kni_ioctl_create(net, ioctl_num, ioctl_param);


static int
kni_ioctl_create(struct net *net, uint32_t ioctl_num,
		unsigned long ioctl_param)
	net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name,
	/* Translate user space info into kernel space info */
	kni->tx_q = phys_to_virt(dev_info.tx_phys);
	kni->rx_q = phys_to_virt(dev_info.rx_phys);
	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
	kni->free_q = phys_to_virt(dev_info.free_phys);
	ret = kni_run_thread(knet, kni, dev_info.force_bind);

kernel thread:


static int
kni_thread_single(void *data)
	struct kni_net *knet = data;
	int j;
	struct kni_dev *dev;

	while (!kthread_should_stop()) {
		for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
			list_for_each_entry(dev, &knet->kni_list_head, list) {

/* rx interface */
kni_net_rx(struct kni_dev *kni)
	 * It doesn't need to check if it is NULL pointer,
	 * as it has a default value
	(*kni_net_rx_func)(kni); ///kni_net_rx_normal
  • kni_net_rx_func


static void kni_net_rx_normal(struct kni_dev *kni);

/* kni rx function pointer, with default to normal rx */
static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 * RX: normal working mode
static void
kni_net_rx_normal(struct kni_dev *kni)
	/* Calculate the number of entries to dequeue from rx_q */
	num_rx = min_t(uint32_t, num_fq, MBUF_BURST_SZ);

	/* Burst dequeue from rx_q */
	num_rx = kni_fifo_get(kni->rx_q, kni->pa, num_rx);
	if (num_rx == 0)
	/* Transfer received packets to netif */
	for (i = 0; i < num_rx; i++) {
		kva = pa2kva(kni->pa[i]);
		len = kva->pkt_len;
		data_kva = kva2data_kva(kva);
		kni->va[i] = pa2va(kni->pa[i], kva);

		skb = dev_alloc_skb(len + 2);
		skb->dev = dev;
		skb->protocol = eth_type_trans(skb, dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;

		/* Call netif interface */
		netif_rx_ni(skb); ///进入内核协议栈


  • KNI kernel interface
 * Transmit a packet (called by the kernel)
static int
kni_net_tx(struct sk_buff *skb, struct net_device *dev)
	/* dequeue a mbuf from alloc_q */
	ret = kni_fifo_get(kni->alloc_q, &pkt_pa, 1);
	if (likely(ret == 1)) {
		void *data_kva;

		pkt_kva = pa2kva(pkt_pa);
		data_kva = kva2data_kva(pkt_kva);
		pkt_va = pa2va(pkt_pa, pkt_kva);

		len = skb->len; /// data length
		memcpy(data_kva, skb->data, len); /// copy data
		if (unlikely(len < ETH_ZLEN)) {
			memset(data_kva + len, 0, ETH_ZLEN - len);
			len = ETH_ZLEN;
		pkt_kva->pkt_len = len;
		pkt_kva->data_len = len;

		/* enqueue mbuf into tx_q */
		ret = kni_fifo_put(kni->tx_q, &pkt_va, 1);/// put tx_q
  • DPDK app
		/* Burst rx from kni */
		num = rte_kni_rx_burst(p->kni[i], pkts_burst, PKT_BURST_SZ);

		/* Burst tx to eth */
		nb_tx = rte_eth_tx_burst(port_id, 0, pkts_burst, (uint16_t)num);


rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
	unsigned ret = kni_fifo_get(kni->tx_q, (void **)mbufs, num);

	/* If buffers removed, allocate mbufs and then put them into alloc_q */
	if (ret)

	return ret;


Introduction to the UIO 2017-10-10T23:00:30+00:00 hustcat http://hustcat.github.io/introduction-to-uio UIO



直接read /dev/uioX来获取UIO设备的中断,read()会被阻塞,发生中断时就会返回。

Each UIO device is accessed through a device file and several sysfs attribute files. The device file will be called /dev/uio0 for the first device, and /dev/uio1, /dev/uio2 and so on for subsequent devices.

/dev/uioX is used to access the address space of the card. Just use mmap() to access registers or RAM locations of your card.

Interrupts are handled by reading from /dev/uioX. A blocking read() from /dev/uioX will return as soon as an interrupt occurs. You can also use select() on /dev/uioX to wait for an interrupt. The integer value read from /dev/uioX represents the total interrupt count. You can use this number to figure out if you missed some interrupts.

uio driver


UIO设备需要UIO内核驱动的支持,uio_pci_generic是一个通用的PCI UIO设备的内核驱动。

UIO does not completely eliminate the need for kernel-space code. A small module is required to set up the device, perhaps interface to the PCI bus, and register an interrupt handler. The last function (interrupt handling) is particularly important; much can be done in user space, but there needs to be an in-kernel interrupt handler which knows how to tell the device to stop crying for attention.

static struct pci_driver driver = {
	.name = "uio_pci_generic",
	.id_table = NULL, /* only dynamic id's */
	.probe = probe,
	.remove = remove,


uio_register_device 注册UIO驱动时,会注册中断处理函数uio_interrupt:

 * uio_register_device - register a new userspace IO device
 * @owner:	module that creates the new device
 * @parent:	parent device
 * @info:	UIO device capabilities
 * returns zero on success or a negative error code.
int __uio_register_device(struct module *owner,
			  struct device *parent,
			  struct uio_info *info)
	if (info->irq && (info->irq != UIO_IRQ_CUSTOM)) {
		ret = request_irq(info->irq, uio_interrupt,
				  info->irq_flags, info->name, idev);
		if (ret)
			goto err_request_irq;

 * uio_interrupt - hardware interrupt handler
 * @irq: IRQ number, can be UIO_IRQ_CYCLIC for cyclic timer
 * @dev_id: Pointer to the devices uio_device structure
static irqreturn_t uio_interrupt(int irq, void *dev_id)
	struct uio_device *idev = (struct uio_device *)dev_id;
	irqreturn_t ret = idev->info->handler(irq, idev->info); ///irqhandler

	if (ret == IRQ_HANDLED)
		uio_event_notify(idev->info); ///notify userspace

	return ret;
  • uio_event_notify


/** notify userspace application
 * uio_event_notify - trigger an interrupt event
 * @info: UIO device capabilities
void uio_event_notify(struct uio_info *info)
	struct uio_device *idev = info->uio_dev;

	wake_up_interruptible(&idev->wait); ///wake up waiting process
	kill_fasync(&idev->async_queue, SIGIO, POLL_IN);
  • read /dev/uioX

每次调用open /dev/uioX时,内核都会创建一个uio_listener,关联到struct file->private_data:

static int uio_open(struct inode *inode, struct file *filep)
	struct uio_device *idev;
	struct uio_listener *listener;
	int ret = 0;

	idev = idr_find(&uio_idr, iminor(inode));
	listener = kmalloc(sizeof(*listener), GFP_KERNEL);
	if (!listener) {
		ret = -ENOMEM;
		goto err_alloc_listener;

	listener->dev = idev;
	listener->event_count = atomic_read(&idev->event);
	filep->private_data = listener;


static ssize_t uio_read(struct file *filep, char __user *buf,
			size_t count, loff_t *ppos)
	struct uio_listener *listener = filep->private_data;
	struct uio_device *idev = listener->dev;
	DECLARE_WAITQUEUE(wait, current);
	add_wait_queue(&idev->wait, &wait);

	do {

		event_count = atomic_read(&idev->event);
		if (event_count != listener->event_count) { ///irq happened
			if (copy_to_user(buf, &event_count, count))
				retval = -EFAULT;
			else {
				listener->event_count = event_count;
				retval = count;

		if (filep->f_flags & O_NONBLOCK) {
			retval = -EAGAIN;

		if (signal_pending(current)) {
			retval = -ERESTARTSYS;
	} while (1);

	remove_wait_queue(&idev->wait, &wait);

	return retval;
  • uio_mmap


static int uio_mmap(struct file *filep, struct vm_area_struct *vma)
	struct uio_listener *listener = filep->private_data;
	struct uio_device *idev = listener->dev;
	int mi;
	unsigned long requested_pages, actual_pages;
	int ret = 0;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;

	vma->vm_private_data = idev;

	mi = uio_find_mem_index(vma);
	if (mi < 0)
		return -EINVAL;

	requested_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	actual_pages = ((idev->info->mem[mi].addr & ~PAGE_MASK)
			+ idev->info->mem[mi].size + PAGE_SIZE -1) >> PAGE_SHIFT;
	if (requested_pages > actual_pages)
		return -EINVAL;

	if (idev->info->mmap) {
		ret = idev->info->mmap(idev->info, vma);
		return ret;

	switch (idev->info->mem[mi].memtype) {
		case UIO_MEM_PHYS:
			return uio_mmap_physical(vma);
			return uio_mmap_logical(vma);
			return -EINVAL;

更多关于mmap参考Linux MMAP & Ioremap introduction.

userspace driver

fd = open(/dev/uio0, O_RDWR|O_SYNC);
/* Map device's registers into user memory */
/* fitting the memory area on pages */
offset = addr & ~PAGE_MASK;
addr = 0 /* region 0 */ * PAGE_SIZE;
size = (size + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
iomem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, 
fd, addr);
iomem += offset;
/* Stop the counting */
*(u_char *)SH_TMU_TSTR(iomem) |= ~(TSTR_TSTR2);
/* Wait for an interrupt */;
read(fd, &n_pending, sizeof(u_long));
val = *(u_int *)SH_TMU2_TCNT(iomem);
/* Stop the TMU */
*(u_char *)SH_TMU_TSTR(iomem) &= ~(TSTR_TSTR2);
munmap(iomem, size);

详细参考 Using UIO in an embedded platform.


Checksum in Linux Kernel 2017-08-15T17:00:30+00:00 hustcat http://hustcat.github.io/checksum-in-kernel calculate IP/TCP/UDP checksum

内核计算IP/TCP/UDP的校验和的方法,参考How to Calculate IP/TCP/UDP Checksum–Part 1 Theory.



static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr, ///check TCP/UDP pseudo-header checksum
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0); ///calc pseudo header checksum

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb); /// 基于伪头累加和,计算整个数据包的checksum
	return 0;




#define NETIF_F_IP_CSUM		__NETIF_F(IP_CSUM) ///ipv4 + TCP/UDP

NETIF_F_IP_CSUM表示硬件可以计算L4 checksum,但是只针对IPV4的TCP和UDP。但是一些设备扩展支持VXLAN和NVGRE。 NETIF_F_IP_CSUM是一种协议感知的计算checksum的方法。具体来说,上层提供两个CSUM的参数(csum_start和csum_offset)。

NETIF_F_HW_CSUM is a protocol agnostic method to offload the transmit checksum. In this method the host provides checksum related parameters in a transmit descriptor for a packet. These parameters include the starting offset of data to checksum and the offset in the packet where the computed checksum is to be written. The length of data to checksum is implicitly the length of the packet minus the starting offset.




 *	@csum: Checksum (must include start/offset pair)
 *	@csum_start: Offset from skb->head where checksumming should start
 *	@csum_offset: Offset from csum_start where checksum should be stored
 *	@ip_summed: Driver fed us an IP checksum
struct sk_buff {
	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;

	__u8			local_df:1,


/* Don't change this without changing skb_csum_unnecessary! */
#define CHECKSUM_UNNECESSARY 1 ///hardware verified the checksums
#define CHECKSUM_PARTIAL 3 ///only compute IP header, not include data





igb_poll -> igb_clean_rx_irq -> igb_process_skb_fields -> igb_rx_checksum:

static inline void igb_rx_checksum(struct igb_ring *ring,
				   union e1000_adv_rx_desc *rx_desc,
				   struct sk_buff *skb)
	/* Rx checksum disabled via ethtool */
	if (!(ring->netdev->features & NETIF_F_RXCSUM)) ///关闭RXCSUM

	/* TCP/UDP checksum error bit is set */
	if (igb_test_staterr(rx_desc,
			     E1000_RXDEXT_STATERR_IPE)) {
		/* work around errata with sctp packets where the TCPE aka
		 * L4E bit is set incorrectly on 64 byte (60 byte w/o crc)
		 * packets, (aka let the stack check the crc32c)
		if (!((skb->len == 60) &&
		      test_bit(IGB_RING_FLAG_RX_SCTP_CSUM, &ring->flags))) {
		/* let the stack verify checksum errors,交给协议栈进一步验证csum */
	/* It must be a TCP or UDP packet with a valid checksum */
	if (igb_test_staterr(rx_desc, E1000_RXD_STAT_TCPCS |
		skb->ip_summed = CHECKSUM_UNNECESSARY; ///stack don't needed verify


int tcp_v4_rcv(struct sk_buff *skb)
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto csum_error;

static inline int skb_csum_unnecessary(const struct sk_buff *skb)
	return skb->ip_summed & CHECKSUM_UNNECESSARY;


  • 设备不支持硬件校验和计算;

  • 设备计算了硬件校验和,但发现该数据帧已经损坏。此时,设备驱动程序可以直接丢弃该数据帧。但有些设备驱动程序(比如e1000/igb/ixgbe)却没有丢弃数据帧,而是将ip_summed设置为CHECKSUM_NONE,然后交给上层协议栈重新计算并处理这种错误。



static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr, ///check TCP/UDP pseudo-header checksum
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;



  • Veth的BUG

这里讨论一个有意思的问题:Linux kernel bug delivers corrupt TCP/IP data to Mesos, Kubernetes, Docker containers.


static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)

	/* don't change ip_summed == CHECKSUM_PARTIAL, as that
	 * will cause bad checksum on forwarded packets
	if (skb->ip_summed == CHECKSUM_NONE &&
	    rcv->features & NETIF_F_RXCSUM)
		skb->ip_summed = CHECKSUM_UNNECESSARY;







CHECKSUM_PARTIAL表示使用硬件checksum ,协议栈已经计算L4层的伪头的校验和,并且已经加入uh->check字段中,此时只需要设备计算整个L4头部和数据的校验值。

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)

				 * Check whether we can use HW checksum.
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
	icsk->icsk_af_ops->send_check(sk, skb); ///tcp_v4_send_check

static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) { ///HW CSUM
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); ///add IPv4 pseudo header checksum
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
						      th->doff << 2,
						      skb->csum)); ///ip_summed == CHECKSUM_NONE

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
  • dev_queue_xmit


int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features))) ///GSO(software offload)
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		} else { ///hardware offload
			if (skb_needs_linearize(skb, features) &&
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			if (skb->ip_summed == CHECKSUM_PARTIAL) { ///only header csum is computed
				if (skb->encapsulation)
				if (!(features & NETIF_F_ALL_CSUM) && ///check hardware if support offload
				     skb_checksum_help(skb)) ///HW not support CSUM
					goto out_kfree_skb;


Remote checksum



Mount namespace and mount propagation 2017-03-10T15:00:30+00:00 hustcat http://hustcat.github.io/mount-namespace-and-mount-propagation Mount namespace and problems

When a new mount namespace is created, it receives a copy of the mount point list replicated from the namespace of the caller of clone() or unshare().

create_new_namespaces -> copy_mnt_ns -> dup_mnt_ns:

 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = alloc_mnt_ns();
	if (IS_ERR(new_ns))
		return new_ns;

	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE); ///拷贝原来namespace所有的文件系统

每个Mount namespace有自己独立的文件系统视图,但是这种隔离性同时也带来一些问题:比如,当系统加载一块新的磁盘时,在最初的实现中,每个namespace必须单独挂载磁盘,才能见到。很多时候,我们希望挂载一次,就能在所有的mount namespace可见。为此,内核在2.6.15引入了shared subtrees feature

The key benefit of shared subtrees is to allow automatic, controlled propagation of mount and unmount events between namespaces. This means, for example, that mounting an optical disk in one mount namespace can trigger a mount of that disk in all other namespaces.

为了支持shared subtrees feature,每个挂载点都会标记propagation type,用于决定在当前挂载点下创建/删除(子)挂载点时,是否传播到别的挂载点。

propagation type



This mount point shares mount and unmount events with other mount points that are members of its “peer group”. When a mount point is added or removed under this mount point, this change will propagate to the peer group, so that the mount or unmount will also take place under each of the peer mount points. Propagation also occurs in the reverse direction, so that mount and unmount events on a peer mount will also propagate to this mount point.


This is the converse of a shared mount point. The mount point does not propagate events to any peers, and does not receive propagation events from any peers.


This propagation type sits midway between shared and private. A slave mount has a master—a shared peer group whose members propagate mount and unmount events to the slave mount. However, the slave mount does not propagate events to the master peer group.


This mount point is unbindable. Like a private mount point, this mount point does not propagate events to or from peers. In addition, this mount point can’t be the source for a bind mount operation.


(1) propagation type是对每个挂载点的设置.

(2) propagation type决定挂载点的直属(immediately under)子挂载点mount/umount事件的传播.比如,挂载点X下创建新的挂载点Y,Y会扩展到X的peer group,但是X不会影响Y下面的子挂载点。


Peer groups

peer group就是一些可以相互传播mount/umount事件的挂载点集合. 对于shared挂载点,当创建新的mount namespace或者作为bind mount的源目标时,就会创建新的成员。这两种情况都会创建新的挂载点,新的挂载点与原来的挂载点构成peer group。同理,当mount namespace释放时,或者挂载点umount时,会从对应的peer group删除。

A peer group is a set of mount points that propagate mount and unmount events to one another. A peer group acquires new members when a mount point whose propagation type is shared is either replicated during the creation of a new namespace or is used as the source for a bind mount. In both cases, the new mount point is made a member of the same peer group as the existing mount point. Conversely, a mount point ceases to be a member of a peer group when it is unmounted, either explicitly, or implicitly when a mount namespace is torn down because the last member process terminates or moves to another namespace.

  • 示例


sh1# mount --make-private / 
sh1# mount --make-shared /dev/sda3 /X 
sh1# mount --make-shared /dev/sda5 /Y 

然后在sh2执行:创建新的mount namespace:

sh2# unshare -m --propagation unchanged sh 

然后再在sh1执行:X bind mount to Z:

sh1# mkdir /Z 
sh1# mount --bind /X /Z 

这会创建2个peer group:

  • 第一个peer group包含X, X’, 和 Z。其中,X和X’是因为namespace的创建,X和Z是因为bind mount产生的。
  • 第二个peer group只包含Y, Y’。

注意,因为/是private的,所以,bind mount Z并不会传播到第二个namespace。


// InitializeMountNamespace sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func InitializeMountNamespace(rootfs, console string, sysReadonly bool, mountConfig *MountConfig) error {
	var (
		err  error
		flag = syscall.MS_PRIVATE

	if mountConfig.NoPivotRoot {
		flag = syscall.MS_SLAVE   ///容器中的mount事件不会传播到init ns

	if err := syscall.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { ///将/设置为private,与init ns完全隔离
		return fmt.Errorf("mounting / with flags %X %s", (flag | syscall.MS_REC), err)

	if err := syscall.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("mouting %s as bind %s", rootfs, err)


TCP SYN cookies make window size suddenly becomes smaller 2017-03-03T20:00:30+00:00 hustcat http://hustcat.github.io/tcp_syn_cookies_and_window_size 问题



经过和几位同事的一起各种定位,最终发现是TCP SYN cookies引起的。简单总结一下,以示后人。

TCP引入SYN cookies是为了解决SYN flood问题。

SYN cookie is a technique used to resist SYN flood attacks.The technique’s primary inventor Daniel J. Bernstein defines SYN cookies as “particular choices of initial TCP sequence numbers by TCP servers.” In particular, the use of SYN cookies allows a server to avoid dropping connections when the SYN queue fills up. Instead, the server behaves as if the SYN queue had been enlarged. The server sends back the appropriate SYN+ACK response to the client but discards the SYN queue entry. If the server then receives a subsequent ACK response from the client, the server is able to reconstruct the SYN queue entry using information encoded in the TCP sequence number.


  • sysctl_max_syn_backlog

sysctl_max_syn_backlog控制Listen Socket的半连接(SYN_RECV)队列长度:

struct inet_connection_sock {

	struct request_sock_queue icsk_accept_queue; ////SYN_RECV sockets queue

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
	sk->sk_state = TCP_LISTEN;

int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	for (lopt->max_qlen_log = 3;
	     (1 << lopt->max_qlen_log) < nr_table_entries;

static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
	return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;

内核会根据listen的backlog和sysctl_max_syn_backlog计算listen socket的SYN queue的长度。如果队列满了,就会输出下面的日志:

TCP: TCP: Possible SYN flooding on port 6000. Dropping request.  Check SNMP counters.
  • sysctl_tcp_syncookies

控制是否启动TCP SYN cookies机制。

extern int sysctl_tcp_syncookies;


当接收端收到发送端的SYN包之后,会创建一个request_sock,再给发送端返回SYN/ACK包后,将request_sock加入到LISTEN socket的SYN table:

tcp_v4_do_rcv(TCP_LISTEN) -> tcp_rcv_state_process -> tcp_v4_conn_request:

///ipv4_specific, LISTEN socket handle SYN packet
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        struct request_sock *req;
	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) { ///SYN queue is full
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie) ///no tcp_syncookies, drop SKB
			goto drop;

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		goto drop;
	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;
	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, ///send SYN/ACK
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie) ///tcp_syncookies, don't add to SYN queue
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); ///Add SYN table
		if (fastopen_cookie_present(&foc) && foc.len != 0)
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) ///fast open
		goto drop_and_free;

当接收端再次收到发送端的ACK包时,内核会从SYN table找到与之对应的tcp_check_req,然后创建新的socket,至此,TCP连接算是完成建立(TCP_ESTABLISHED): tcp_v4_do_rcv(TCP_LISTEN) -> tcp_v4_hnd_req -> tcp_check_req:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr); ///get request_sock from SYN table
	if (req)
		return tcp_check_req(sk, skb, req, prev, false); /// create new socket

SYN cookies

在没有开启tcp_syncookies选项时,如果LISTEN socket的SYN queue满之后,会直接丢掉SKB:

///ipv4_specific, LISTEN socket handle SYN packet
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) { ///SYN queue is full
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie) ///no tcp_syncookies, drop SKB
			goto drop;

开启tcp_syncookies之后,如果LISTEN socket的SYN queue满之后,会创建request_sock,再返给对端SYN/ACK后,并不会将request_sock对象加到SYN queue,而是将其释放:

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, ///send SYN/ACK
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie) ///tcp_syncookies, don't add to SYN queue
			goto drop_and_free;

这样,当收到对端的ACK后,tcp_v4_hnd_req从SYN queue找不到对应的request_sock对象,就会进入syncookies的处理逻辑: tcp_v4_do_rcv -> tcp_v4_hnd_req:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
	return sk;


SYN cookies与TCP options

对于走SYN cookies逻辑的连接,由于内核没有保存相关socket的状态,所以,SYN包中携带的TCP options就会丢失。

  • MSS


struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
			     struct ip_options *opt)
	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
		goto out;

	if (tcp_synq_no_recent_overflow(sk) ||
	    (mss = cookie_check(skb, cookie)) == 0) { ///mss option value
		goto out;

	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
	if (!req)
		goto out;
	/* Try to redo what tcp_v4_send_synack did. */
	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
	///initial window
	tcp_select_initial_window(tcp_full_space(sk), req->mss,
				  &req->rcv_wnd, &req->window_clamp,
				  ireq->wscale_ok, &rcv_wscale,
				  dst_metric(&rt->dst, RTAX_INITRWND));

	ireq->rcv_wscale  = rcv_wscale;
  • wscale

但是,对于其它option,比如wscale、SACK等信息,就会丢失。后来,又使用timestamp来保存wscale,后来又取消了,参考1和2。详细参考Improving syncookies

对于TCP SYN cookies的处理逻辑,接收端在收到对端的ACK之后,会重新计算wscale,而不是TCP在建立连接的SYN/SYN-ACK过程协商的wscale,由于wscale的计算受recv buffer等参数的影响,会导致第二次计算的wscale与前面协商的不一致,从而导致发送端和接收端的wscale不一致:

void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
	unsigned int space = (__space < 0 ? 0 : __space); ///sk_rcvbuf size
	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;



这本来是一个很简单的问题,但定位过程却走了不少弯路,从一开始就聚焦于TCP窗口机制,企图从中找问题,而忽略了内核的一些关键输出。再次说明了那个问题:*表面上复杂的问题,背后的原因都非常简单!*

不管怎样,目前内核的TCP SYN cookies机制是有缺陷的,请慎用。


Dive deep into inotify and overlayfs 2017-01-06T11:00:30+00:00 hustcat http://hustcat.github.io/dive-into-inotify-and-overlayfs Introduction



  • inotifywait


#inotifywait -rme modify,open,create,delete,close /root/dbyin/test/
Setting up watches.  Beware: since -r was given, this may take a while!
Watches established.
/root/dbyin/test/ CREATE f1.txt
/root/dbyin/test/ OPEN f1.txt
/root/dbyin/test/ MODIFY f1.txt
/root/dbyin/test/ CLOSE_WRITE,CLOSE f1.txt
/root/dbyin/test/ DELETE f1.txt

Another terminal:

#echo hello > /root/dbyin/test/f1.txt
#rm /root/dbyin/test/f1.txt




  • fsnotify_group


 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
struct fsnotify_group {

	const struct fsnotify_ops *ops;	/* how this group handles things, inotify_fops */
	struct list_head notification_list;	/* list of event_holder this group needs to send to userspace, fsnotify_event list */
	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
	unsigned int q_len;			/* events on the queue */
	unsigned int max_events;		/* maximum events allowed on the list */

	struct list_head marks_list;	/* all inode marks for this group, struct fsnotify_mark list */

	struct fasync_struct    *fsn_fa;    /* async notification */

	/* groups can define private fields here or use the void *private */
	union {
		void *private;
		struct inotify_group_private_data {
			spinlock_t	idr_lock;
			struct idr      idr;   ///id -> inotify_inode_mark*
			struct user_struct      *user;
		} inotify_data; ///for inotify
  • fsnotify_mark


struct inotify_inode_mark {
	struct fsnotify_mark fsn_mark;
	int wd; ///watch descriptor

struct fsnotify_mark {
	__u32 mask;			/* mask this mark is for */
	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
	 * in kernel that found and may be using this mark. */
	atomic_t refcnt;		/* active things looking at this mark */
	struct fsnotify_group *group;	/* group this mark is for */
	struct list_head g_list;	/* list of marks by group->i_fsnotify_marks */
	spinlock_t lock;		/* protect group and inode */
	union {
		struct fsnotify_inode_mark i;
		struct fsnotify_vfsmount_mark m;
	__u32 ignored_mask;		/* events types to ignore */
	unsigned int flags;		/* vfsmount or inode mark? */
	struct list_head destroy_list;
	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */

 * Inode specific fields in an fsnotify_mark
struct fsnotify_inode_mark {
	struct inode *inode;		/* inode this mark is associated with */
	struct hlist_node i_list;	/* list of marks by inode->i_fsnotify_marks */
	struct list_head free_i_list;	/* tmp list used when freeing this mark */
  • inode and file
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
struct inode {

	__u32			i_fsnotify_mask; /* all events this inode cares about */
	struct hlist_head	i_fsnotify_marks; ///struct fsnotify_inode_mark list, see fsnotify_inode_mark.i_list

struct file {
	void			*private_data; ///fsnotify_group*


  • 数据结构


struct dentry {
	struct dentry *d_parent;	/* parent directory,父目录dentry对象 */
	struct qstr d_name;   ///当前分量的名称
	struct inode *d_inode;		/* inode对象, create by ovl_new_inode */
	const struct dentry_operations *d_op; /// == super_block->s_d_op == ovl_dentry_operations
	struct super_block *d_sb;	/* The root of the dentry tree */

	void *d_fsdata;			/* fs-specific data, struct ovl_entry */

/* private information held for every overlayfs dentry */
struct ovl_entry {
	struct dentry *__upperdentry; ///not NULL if got in upperdir
	struct ovl_dir_cache *cache;
	union {
		struct {
			u64 version;
			bool opaque;
		struct rcu_head rcu;
	unsigned numlower;
	struct path lowerstack[]; ///not NULL if got in lowdir

struct inode {
	const struct inode_operations	*i_op; ///ovl_dir_inode_operations
	struct super_block	*i_sb;
	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops, ovl_dir_operations */
	void			*i_private; /* fs or device private pointer,  struct ovl_entry*/

dentry是内核的目录项对象,每个目录(文件)都有一个对应的对象,对于overlayfs的每个dentry的指向的inode并没有实际的磁盘数据,而是由ovl_new_inode创建的一个内存inode;dentry->d_fsdata指向ovl_entry,而后者指向真正的underlay fs的dentry。

在overlayfs遍历时,dentry->inode并没有多大用,实际上,在ovl_lookup中,代表父目录的inode参数struct inode *dir并没有使用到。而dentry->d_fsdata指向ovl_entry才是进行查找的关键因素,通过ovl_entry进入到underlay fs的查找。

///dir: parent directory inode object, dentry: dentry object for current finding dircotry entry
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags) ///called by lookup_real
	struct ovl_entry *oe;
	struct ovl_entry *poe = dentry->d_parent->d_fsdata; ///dentry->d_parent->d_inode == dir
	struct path *stack = NULL;
	struct dentry *upperdir, *upperdentry = NULL;
	unsigned int ctr = 0;
	struct inode *inode = NULL;
	bool upperopaque = false;
	struct dentry *this, *prev = NULL;
	unsigned int i;
	int err;

	upperdir = ovl_upperdentry_dereference(poe);
	if (upperdir) { ///(1)lookup in upperdir firstly
		this = ovl_lookup_real(upperdir, &dentry->d_name);
		err = PTR_ERR(this);
		if (IS_ERR(this))
			goto out;

		if (this) {///exist in upperdir
			if (unlikely(ovl_dentry_remote(this))) {
				err = -EREMOTE;
				goto out;
			if (ovl_is_whiteout(this)) {
				dput(this); ///whiteout file
				this = NULL;
				upperopaque = true;
			} else if (poe->numlower && ovl_is_opaquedir(this)) {
				upperopaque = true; ///opaque dir
		upperdentry = prev = this;
	///(2)didn't find dentry in upperdir
	if (!upperopaque && poe->numlower) {
		err = -ENOMEM;
		stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
		if (!stack)
			goto out_put_upper;
	///(3)find dentry in lowdir
	for (i = 0; !upperopaque && i < poe->numlower; i++) {
		bool opaque = false;
		struct path lowerpath = poe->lowerstack[i];

		this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
		err = PTR_ERR(this);
		if (IS_ERR(this)) {
			 * If it's positive, then treat ENAMETOOLONG as ENOENT.
			if (err == -ENAMETOOLONG && (upperdentry || ctr))
			goto out_put;
		if (!this)
		if (ovl_is_whiteout(this)) {
		 * Only makes sense to check opaque dir if this is not the
		 * lowermost layer.
		if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
			opaque = true;

		if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
			     !S_ISDIR(this->d_inode->i_mode))) {
			 * FIXME: check for upper-opaqueness maybe better done
			 * in remove code.
			if (prev == upperdentry)
				upperopaque = true;
		 * If this is a non-directory then stop here.
		if (!S_ISDIR(this->d_inode->i_mode))
			opaque = true;

		stack[ctr].dentry = this;
		stack[ctr].mnt = lowerpath.mnt;
		prev = this;
		if (opaque)

	oe = ovl_alloc_entry(ctr); ///ovl_dentry for current finding dentry
	err = -ENOMEM;
	if (!oe)
		goto out_put;

	if (upperdentry || ctr) {///if got in upperdir, upperdentry != NULL; else if got in lowdir, ctr > 0
		struct dentry *realdentry;

		realdentry = upperdentry ? upperdentry : stack[0].dentry;
		///alloc overlayfs inode for current real inode
		err = -ENOMEM;
		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
		if (!inode)
			goto out_free_oe;
		ovl_copyattr(realdentry->d_inode, inode);

	oe->opaque = upperopaque;
	oe->__upperdentry = upperdentry;
	memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
	dentry->d_fsdata = oe; ///ovl_entry
	d_add(dentry, inode);

	return NULL;

	for (i = 0; i < ctr; i++)
	return ERR_PTR(err);
  • open and copy up

overlayfs在打开文件时,会让struct file->f_inode指向real inode;而且,如果会修改文件,且upperdir不存在该文件,则会从lowerdir进行copy up:

int vfs_open(const struct path *path, struct file *file,
            const struct cred *cred)
	struct dentry *dentry = path->dentry; ///overlayfs dentry
	struct inode *inode = dentry->d_inode; ///overlayfs inode

	file->f_path = *path;
	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
		inode = dentry->d_op->d_select_inode(dentry, file->f_flags); ///get real inode, ovl_dentry_operations
		if (IS_ERR(inode))
			return PTR_ERR(inode);

	return do_dentry_open(file, inode, NULL, cred); ///file->f_inode = inode

///return underlay fs inode
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
	int err;
	struct path realpath;
	enum ovl_path_type type;

	if (S_ISDIR(dentry->d_inode->i_mode))
		return dentry->d_inode;

	type = ovl_path_real(dentry, &realpath); ///real dentry
	if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { ///need copy up
		err = ovl_want_write(dentry);
		if (err)
			return ERR_PTR(err);

		if (file_flags & O_TRUNC)
			err = ovl_copy_up_truncate(dentry);
			err = ovl_copy_up(dentry); ///copy up
		if (err)
			return ERR_PTR(err);

		ovl_path_upper(dentry, &realpath);

	if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
		return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);

	return realpath.dentry->d_inode; ///return real inode

Inotify and Overlayfs

inotify_add_watch使用的是overlayfs inode:

SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
		u32, mask)

	ret = inotify_find_inode(pathname, &path, flags); ///返回overlayfs inode
	if (ret)
		goto fput_and_out;

	/* inode held in place by reference to path; group by fget on fd */
	inode = path.dentry->d_inode; ///monitored file(overlay inode)
	group = f.file->private_data; ///notify group

	/* create/update an inode mark */
	ret = inotify_update_watch(group, inode, mask);


fsnotify_open使用的是underlayfs inode:

 * fsnotify_open - file was opened
static inline void fsnotify_open(struct file *file)
	struct path *path = &file->f_path;
	struct inode *inode = file_inode(file); ///for overlayfs , after vfs_open, f->f_inode == underlay inode
	__u32 mask = FS_OPEN;

	if (S_ISDIR(inode->i_mode))
		mask |= FS_ISDIR;

	fsnotify_parent(path, NULL, mask);
	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);

vfs_open中,内核会将file->f_inode指向underlayfs inode:

int vfs_open(const struct path *path, struct file *file,
            const struct cred *cred)
	struct dentry *dentry = path->dentry; ///overlayfs dentry
	struct inode *inode = dentry->d_inode; ///overlayfs inode

	file->f_path = *path;
	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
		inode = dentry->d_op->d_select_inode(dentry, file->f_flags); ///get underlayfs inode, ovl_dentry_operations
		if (IS_ERR(inode))
			return PTR_ERR(inode);

	return do_dentry_open(file, inode, NULL, cred); ///file->f_inode = inode