Understanding the RoCE network protocol (2017-11-09, hustcat, http://hustcat.github.io/roce-protocol)

RoCE is short for RDMA over Converged Ethernet; it lets RDMA run over an Ethernet fabric. The other approach is RDMA over InfiniBand, so RoCE (strictly speaking, RoCEv1) is a link-layer protocol that corresponds to InfiniBand.

There are two RoCE versions, RoCE v1 and RoCE v2. RoCE v1 is an Ethernet link layer protocol and hence allows communication between any two hosts in the same Ethernet broadcast domain. RoCE v2 is an internet layer protocol which means that RoCE v2 packets can be routed.

RoCEv1

For a RoCE fabric, the hardware side requires an L2 Ethernet switch that supports IEEE DCB, and the compute nodes need RoCE-capable NICs:

On the hardware side, basically you need an L2 Ethernet switch with IEEE DCB (Data Center Bridging, aka Converged Enhanced Ethernet) with support for priority flow control.

 On the compute or storage server end, you need an RoCE-capable network adapter.

The corresponding frame format is as follows:

The protocol specification is the InfiniBand™ Architecture Specification Release 1.2.1, Annex A16: RoCE.

Example:

RoCEv2

Because RoCEv1 frames carry no IP header, communication is limited to a single L2 subnet. RoCEv2 therefore extends RoCEv1 by replacing the GRH (Global Routing Header) with a UDP header plus an IP header:

RoCEv2 is a straightforward extension of the RoCE protocol that involves a simple modification of the RoCE packet format.

Instead of the GRH, RoCEv2 packets carry an IP header which allows traversal of IP L3 Routers and a UDP header that serves as a stateless encapsulation layer for the RDMA Transport Protocol Packets over IP.
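As an illustration, here is a rough C sketch of the resulting encapsulation (not taken from the spec text above; 4791 is the IANA-assigned RoCEv2 UDP destination port, and only a few BTH fields are shown):

#include <stdint.h>

/* Simplified on-the-wire layout of a RoCEv2 packet (sketch only; see the
 * IBTA spec for the authoritative header definitions):
 *
 *   RoCEv1: Ethernet | GRH                  | BTH | payload | ICRC | FCS
 *   RoCEv2: Ethernet | IP | UDP (dport 4791) | BTH | payload | ICRC | FCS
 */
#define ROCEV2_UDP_DPORT 4791   /* IANA-assigned UDP port for RoCEv2 */

struct ib_bth {                 /* InfiniBand Base Transport Header, 12 bytes */
	uint8_t  opcode;        /* e.g. RC SEND Only */
	uint8_t  flags;         /* SE/M/pad/transport header version */
	uint16_t pkey;          /* partition key */
	uint32_t dest_qp;       /* reserved byte + 24-bit destination QP */
	uint32_t psn;           /* A bit + 24-bit packet sequence number */
} __attribute__((packed));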

The frame format is as follows:

Example:

It is worth mentioning that kernel 4.9 added a software implementation of RoCEv2, namely Soft-RoCE.

Refs

Linux Soft-RoCE implementation (2017-11-08, hustcat, http://hustcat.github.io/linux-soft-roce-implementation)

Soft-RoCE, added in kernel 4.9, implements RoCEv2 in software.

Queue initialization

libRXE (user space library)

ibv_create_qp
|--- rxe_create_qp
    |--- ibv_cmd_create_qp
  • ibv_create_qp
LATEST_SYMVER_FUNC(ibv_create_qp, 1_1, "IBVERBS_1.1",
		   struct ibv_qp *,
		   struct ibv_pd *pd,
		   struct ibv_qp_init_attr *qp_init_attr)
{
	struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr); ///rxe_ctx_ops
///..
}
  • rxe_create_qp
static struct ibv_qp *rxe_create_qp(struct ibv_pd *pd,
				    struct ibv_qp_init_attr *attr)
{
	struct ibv_create_qp cmd;
	struct rxe_create_qp_resp resp;
	struct rxe_qp *qp;
	int ret;
////..
	ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd,
				&resp.ibv_resp, sizeof resp); /// ibv_create_qp CMD, to kernel
///...
	qp->sq.max_sge = attr->cap.max_send_sge;
	qp->sq.max_inline = attr->cap.max_inline_data;
	qp->sq.queue = mmap(NULL, resp.sq_mi.size, PROT_READ | PROT_WRITE,
			    MAP_SHARED,
			    pd->context->cmd_fd, resp.sq_mi.offset); ///mmap, see rxe_mmap

ibv_context->cmd_fd refers to the corresponding ibv_device and is set up by ibv_open_device.

ibv_cmd_create_qp sends the IB_USER_VERBS_CMD_CREATE_QP command to the kernel through ibv_context->cmd_fd; see libibverbs@ibv_cmd_create_qp.

The corresponding kernel write handler is ib_uverbs_write:

///drivers/infiniband/core/uverbs_main.c
static const struct file_operations uverbs_fops = {
	.owner	 = THIS_MODULE,
	.write	 = ib_uverbs_write,
	.open	 = ib_uverbs_open,
	.release = ib_uverbs_close,
	.llseek	 = no_llseek,
};
  • ibv_open_device
///libibverbs/device.c
LATEST_SYMVER_FUNC(ibv_open_device, 1_1, "IBVERBS_1.1",
		   struct ibv_context *,
		   struct ibv_device *device)
{
	struct verbs_device *verbs_device = verbs_get_device(device);
	char *devpath;
	int cmd_fd, ret;
	struct ibv_context *context;
	struct verbs_context *context_ex;

	if (asprintf(&devpath, "/dev/infiniband/%s", device->dev_name) < 0)
		return NULL;

	/*
	 * We'll only be doing writes, but we need O_RDWR in case the
	 * provider needs to mmap() the file.
	 */
	cmd_fd = open(devpath, O_RDWR | O_CLOEXEC); /// /dev/infiniband/uverbs0
	free(devpath);

	if (cmd_fd < 0)
		return NULL;

	if (!verbs_device->ops->init_context) {
		context = verbs_device->ops->alloc_context(device, cmd_fd); ///rxe_alloc_context, rxe_dev_ops
		if (!context)
			goto err;
	}
///...
	context->device = device;
	context->cmd_fd = cmd_fd;
	pthread_mutex_init(&context->mutex, NULL);

	ibverbs_device_hold(device);

	return context;
///...
}

kernel (rdma_rxe module)

  • ib_uverbs_create_qp

The handler for IB_USER_VERBS_CMD_CREATE_QP is ib_uverbs_create_qp.

ib_uverbs_write
|--- ib_uverbs_create_qp
     |--- create_qp
	      |--- ib_device->create_qp
		       |--- rxe_create_qp

create_qp invokes ib_device->create_qp, which for RXE is rxe_create_qp; see rxe_register_device.

  • rxe_create_qp
rxe_create_qp
|--- rxe_qp_from_init
     |--- rxe_qp_init_req

rxe_qp_from_init initializes the send queue and the receive queue.

  • rxe_qp_init_req

rxe_qp_init_req mainly does the following:

  • creates the corresponding UDP socket;

  • calls rxe_queue_init to initialize the send queue;

  • initializes the corresponding tasklet.

  • rxe_queue_init

rxe_queue_init allocates the memory for the queue:

struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
				 int *num_elem,
				 unsigned int elem_size)
{
	struct rxe_queue *q;
	size_t buf_size;
	unsigned int num_slots;
///...
	buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size;

	q->buf = vmalloc_user(buf_size);
///...
}

The buffer that rxe_queue->buf points to is mapped into user space by rxe_mmap; each queue element corresponds to a struct rxe_send_wqe.

When the libibverbs API ibv_post_send is called, the corresponding struct rxe_send_wqe is appended to this queue; see rdma-core@post_one_send.
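From the application's side this enqueue happens through the standard ibv_post_send verb. A minimal sketch of posting one send WR (qp, mr and buf are assumed to come from earlier setup):

#include <stdint.h>
#include <infiniband/verbs.h>

/* Post a single signaled SEND; on rxe this becomes a rxe_send_wqe in the
 * mmap'ed send queue, and post_send_db() then issues IB_USER_VERBS_CMD_POST_SEND. */
static int post_one_send_example(struct ibv_qp *qp, struct ibv_mr *mr,
				 void *buf, uint32_t len)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t)buf,
		.length = len,
		.lkey   = mr->lkey,
	};
	struct ibv_send_wr wr = {
		.wr_id      = 1,
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode     = IBV_WR_SEND,
		.send_flags = IBV_SEND_SIGNALED,  /* ask for a completion entry */
	};
	struct ibv_send_wr *bad_wr = NULL;

	return ibv_post_send(qp, &wr, &bad_wr);
}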

  • rxe_mmap
/**
 * rxe_mmap - create a new mmap region
 * @context: the IB user context of the process making the mmap() call
 * @vma: the VMA to be initialized
 * Return zero if the mmap is OK. Otherwise, return an errno.
 */
int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
{
	struct rxe_dev *rxe = to_rdev(context->device);
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct rxe_mmap_info *ip, *pp;
///...
found_it:
	list_del_init(&ip->pending_mmaps);
	spin_unlock_bh(&rxe->pending_lock);

	ret = remap_vmalloc_range(vma, ip->obj, 0);
	if (ret) {
		pr_err("rxe: err %d from remap_vmalloc_range\n", ret);
		goto done;
	}

	vma->vm_ops = &rxe_vm_ops;
	vma->vm_private_data = ip;
	rxe_vma_open(vma);
///...
}

Sending data

libRXE

rxe_post_send converts each struct ibv_send_wr into a struct rxe_send_wqe, appends it to the send queue rxe_qp->sq, and then sends the IB_USER_VERBS_CMD_POST_SEND command to the RXE kernel module through cmd_fd:

///providers/rxe/rxe.c
/* this API does not make a distinction between
   restartable and non-restartable errors */
static int rxe_post_send(struct ibv_qp *ibqp,
			 struct ibv_send_wr *wr_list,
			 struct ibv_send_wr **bad_wr)
{
	int rc = 0;
	int err;
	struct rxe_qp *qp = to_rqp(ibqp);/// ibv_qp -> rxe_qp
	struct rxe_wq *sq = &qp->sq;

	if (!bad_wr)
		return EINVAL;

	*bad_wr = NULL;

	if (!sq || !wr_list || !sq->queue)
	 	return EINVAL;

	pthread_spin_lock(&sq->lock);

	while (wr_list) {
		rc = post_one_send(qp, sq, wr_list); /// ibv_send_wr -> rxe_send_wqe, enqueue
		if (rc) {
			*bad_wr = wr_list;
			break;
		}

		wr_list = wr_list->next;
	}

	pthread_spin_unlock(&sq->lock);

	err =  post_send_db(ibqp); /// IB_USER_VERBS_CMD_POST_SEND cmd
	return err ? err : rc;
}

kernel

The function that handles IB_USER_VERBS_CMD_POST_SEND is ib_uverbs_post_send:

ib_uverbs_post_send -> ib_device->post_send -> rxe_post_send -> rxe_requester -> ip_local_out

  • rxe_post_send
static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
			 struct ib_send_wr **bad_wr)
{
	int err = 0;
	struct rxe_qp *qp = to_rqp(ibqp); ///ib_qp -> rxe_qp
///...
	/*
	 * Must sched in case of GSI QP because ib_send_mad() hold irq lock,
	 * and the requester call ip_local_out_sk() that takes spin_lock_bh.
	 */
	must_sched = (qp_type(qp) == IB_QPT_GSI) ||
			(queue_count(qp->sq.queue) > 1);

	rxe_run_task(&qp->req.task, must_sched); /// to rxe_requester

	return err;
}
  • rxe_requester

rxe_requester dequeues rxe_send_wqe entries from the rxe_qp queue, builds the corresponding sk_buff, and hands it down to the rxe_dev device:

///sw/rxe/rxe_req.c
int rxe_requester(void *arg)
{
	struct rxe_qp *qp = (struct rxe_qp *)arg;
	struct rxe_pkt_info pkt;
	struct sk_buff *skb;
	struct rxe_send_wqe *wqe;
///...
	wqe = req_next_wqe(qp); /// get rxe_send_wqe
///...
	/// rxe_send_wqe -> skb
	skb = init_req_packet(qp, wqe, opcode, payload, &pkt);
///...
	ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
///...
}

static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
				  struct rxe_pkt_info *pkt, struct sk_buff *skb)
{
///...
	if (pkt->mask & RXE_LOOPBACK_MASK) {
		memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
		err = rxe->ifc_ops->loopback(skb);
	} else {
		err = rxe->ifc_ops->send(rxe, pkt, skb);/// ifc_ops->send, send
	}
///...
}

ifc_ops->send eventually calls ip_local_out, which transmits the packet out of the corresponding physical NIC.

Refs

RDMA Programming - Based on linux-rdma (2017-11-08, hustcat, http://hustcat.github.io/rdma-programming)

linux-rdma is the user-space counterpart of the Linux kernel InfiniBand subsystem (drivers/infiniband); it provides the InfiniBand Verbs API and the RDMA Verbs API.

Basic concepts

  • Queue Pair(QP)

To perform RDMA operations, a connection must be established between the two endpoints; this is done through a Queue Pair (QP), which plays the role of a socket. Both ends of the communication have to initialize their QPs, and the Communication Manager (CM) exchanges QP information before the two sides actually establish the connection.

Once a QP is established, the verbs API can be used to perform RDMA reads, RDMA writes, and atomic operations. Serialized send/receive operations, which are similar to socket reads/writes, can be performed as well.

A QP corresponds to struct ibv_qp, and ibv_create_qp is used to create one.

/**
 * ibv_create_qp - Create a queue pair.
 */
struct ibv_qp *ibv_create_qp(struct ibv_pd *pd,
			     struct ibv_qp_init_attr *qp_init_attr);
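A minimal sketch of using it to create an RC QP (pd and cq are assumed to exist already; the queue depths are arbitrary example values):

#include <stdio.h>
#include <infiniband/verbs.h>

static struct ibv_qp *create_rc_qp(struct ibv_pd *pd, struct ibv_cq *cq)
{
	struct ibv_qp_init_attr attr = {
		.send_cq = cq,
		.recv_cq = cq,              /* a single CQ may serve both queues */
		.qp_type = IBV_QPT_RC,      /* reliable connected */
		.cap = {
			.max_send_wr  = 16, /* example queue depths */
			.max_recv_wr  = 16,
			.max_send_sge = 1,
			.max_recv_sge = 1,
		},
	};
	struct ibv_qp *qp = ibv_create_qp(pd, &attr);

	if (!qp)
		perror("ibv_create_qp");
	return qp;
}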
  • Completion Queue(CQ)

A Completion Queue is an object which contains the completed work requests which were posted to the Work Queues (WQ). Every completion says that a specific WR was completed (both successfully completed WRs and unsuccessfully completed WRs). A Completion Queue is a mechanism to notify the application about information of ended Work Requests (status, opcode, size, source).

The corresponding data structure is struct ibv_cq, and ibv_create_cq creates a CQ:

/**
 * ibv_create_cq - Create a completion queue
 * @context - Context CQ will be attached to
 * @cqe - Minimum number of entries required for CQ
 * @cq_context - Consumer-supplied context returned for completion events
 * @channel - Completion channel where completion events will be queued.
 *     May be NULL if completion events will not be used.
 * @comp_vector - Completion vector used to signal completion events.
 *     Must be >= 0 and < context->num_comp_vectors.
 */
struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe,
			     void *cq_context,
			     struct ibv_comp_channel *channel,
			     int comp_vector);
  • Memory Registration (MR)

Memory Registration is a mechanism that allows an application to describe a set of virtually contiguous memory locations or a set of physically contiguous memory locations to the network adapter as a virtually contiguous buffer using Virtual Addresses.

The corresponding data structure is struct ibv_mr:

struct ibv_mr {
	struct ibv_context     *context;
	struct ibv_pd	       *pd;
	void		       *addr;
	size_t			length;
	uint32_t		handle;
	uint32_t		lkey;
	uint32_t		rkey;
};

Every MR has a remote and a local key (rkey, lkey).

Local keys are used by the local HCA to access local memory, such as during a receive operation.

Remote keys are given to the remote HCA to allow a remote process access to system memory during RDMA operations.

ibv_reg_mr registers a memory region (MR), associates it with a protection domain (PD), and assigns it local and remote keys (lkey, rkey).

/**
 * ibv_reg_mr - Register a memory region
 */
struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr,
			  size_t length, int access);
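A minimal sketch of registering a buffer (pd is assumed to exist; the access flags here are just one common combination):

#include <stdlib.h>
#include <infiniband/verbs.h>

static struct ibv_mr *register_buffer(struct ibv_pd *pd, size_t size)
{
	void *buf = calloc(1, size);   /* any valid memory in the process works */
	if (!buf)
		return NULL;

	/* Local access plus permission for the remote side to RDMA-write here. */
	struct ibv_mr *mr = ibv_reg_mr(pd, buf, size,
				       IBV_ACCESS_LOCAL_WRITE |
				       IBV_ACCESS_REMOTE_WRITE);
	if (!mr)
		free(buf);
	return mr;    /* mr->lkey / mr->rkey are then used in work requests */
}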
  • Protection Domain (PD)

Object whose components can interact with only each other. These components can be AH, QP, MR, and SRQ.

A protection domain is used to associate Queue Pairs with Memory Regions and Memory Windows , as a means for enabling and controlling network adapter access to Host System memory.

struct ibv_pd is used to implement protection domains:

struct ibv_pd {
	struct ibv_context     *context;
	uint32_t		handle;
};

ibv_alloc_pd creates a protection domain (PD). PDs limit which memory regions can be accessed by which queue pairs (QP) providing a degree of protection from unauthorized access.

/**
 * ibv_alloc_pd - Allocate a protection domain
 */
struct ibv_pd *ibv_alloc_pd(struct ibv_context *context);
  • Send Request (SR)

An SR defines how much data will be sent, from where, how and, with RDMA, to where. struct ibv_send_wr is used to implement SRs. See struct ibv_send_wr.

Example (IB Verbs API)

RDMA applications can be programmed against either the librdmacm or the libibverbs API; the former is a higher-level wrapper around the latter.

rc_pingpong is an example that programs directly against the libibverbs API.

In general, the basic flow when using the IB Verbs API is as follows:

  • (1) Get the device list

First you must retrieve the list of available IB devices on the local host. Every device in this list contains both a name and a GUID. For example the device names can be: mthca0, mlx4_1. See here.

IB devices correspond to struct ibv_device:

struct ibv_device {
	struct _ibv_device_ops	_ops;
	enum ibv_node_type	node_type;
	enum ibv_transport_type	transport_type;
	/* Name of underlying kernel IB device, eg "mthca0" */
	char			name[IBV_SYSFS_NAME_MAX];
	/* Name of uverbs device, eg "uverbs0" */
	char			dev_name[IBV_SYSFS_NAME_MAX];
	/* Path to infiniband_verbs class device in sysfs */
	char			dev_path[IBV_SYSFS_PATH_MAX];
	/* Path to infiniband class device in sysfs */
	char			ibdev_path[IBV_SYSFS_PATH_MAX];
};

Applications obtain the list of IB devices through the ibv_get_device_list API:

/**
 * ibv_get_device_list - Get list of IB devices currently available
 * @num_devices: optional.  if non-NULL, set to the number of devices
 * returned in the array.
 *
 * Return a NULL-terminated array of IB devices.  The array can be
 * released with ibv_free_device_list().
 */
struct ibv_device **ibv_get_device_list(int *num_devices);
  • (2) Open the requested device

Iterate over the device list, choose a device according to its GUID or name and open it. See here.

The application calls ibv_open_device to open the IB device:

/**
 * ibv_open_device - Initialize device for use
 */
struct ibv_context *ibv_open_device(struct ibv_device *device);

It returns an ibv_context object:

struct ibv_context {
	struct ibv_device      *device;
	struct ibv_context_ops	ops;
	int			cmd_fd;
	int			async_fd;
	int			num_comp_vectors;
	pthread_mutex_t		mutex;
	void		       *abi_compat;
};
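Putting steps (1) and (2) together, a minimal sketch that picks the first device and opens it:

#include <stdio.h>
#include <infiniband/verbs.h>

static struct ibv_context *open_first_device(void)
{
	int num = 0;
	struct ibv_device **list = ibv_get_device_list(&num);
	struct ibv_context *ctx = NULL;

	if (!list || num == 0) {
		fprintf(stderr, "no IB devices found\n");
		return NULL;
	}
	printf("using device %s (uverbs: %s)\n",
	       ibv_get_device_name(list[0]), list[0]->dev_name);

	ctx = ibv_open_device(list[0]);   /* opens /dev/infiniband/uverbsN */
	ibv_free_device_list(list);       /* the open context stays valid */
	return ctx;
}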
  • (3) Allocate a Protection Domain

Allocate a PD; see here.

A Protection Domain (PD) allows the user to restrict which components can interact with only each other.

These components can be AH, QP, MR, MW, and SRQ.

  • (4) Register a memory region

Register an MR; see here.

Any memory buffer which is valid in the process’s virtual space can be registered.

During the registration process the user sets memory permissions and receives local and remote keys (lkey/rkey) which will later be used to refer to this memory buffer.

  • (5) Create a Completion Queue(CQ)

Create a CQ; see here.

A CQ contains completed work requests (WR). Each WR will generate a completion queue entry (CQE) that is placed on the CQ.

The CQE will specify if the WR was completed successfully or not.

  • (6) Create a Queue Pair(QP)

Create a QP; see here.

Creating a QP will also create an associated send queue and receive queue.

  • (7) Bring up a QP

Bring up the QP; see here.

A created QP still cannot be used until it is transitioned through several states, eventually getting to Ready To Send (RTS).

This provides needed information used by the QP to be able send / receive data.

ibv_modify_qp changes the state of a QP:

/**
 * ibv_modify_qp - Modify a queue pair.
 */
int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		  int attr_mask);

For example, for a client/server pair, the QP must be brought to the RTS state; see rc_pingpong@pp_connect_ctx.

A QP goes through the following states (a minimal ibv_modify_qp sketch follows the list):

RESET   Newly created, queues empty.
INIT    Basic information set. Ready for posting to receive queue.
RTR     Ready to Receive. Remote address info set for connected QPs, QP may now receive packets.
RTS     Ready to Send. Timeout and retry parameters set, QP may now send packets.
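As mentioned above, here is a minimal sketch of the first transition (RESET to INIT); the later RTR and RTS transitions additionally need the peer's QPN/PSN and path information exchanged out of band, and the port number and access flags below are example values:

#include <infiniband/verbs.h>

static int qp_to_init(struct ibv_qp *qp)
{
	struct ibv_qp_attr attr = {
		.qp_state        = IBV_QPS_INIT,
		.pkey_index      = 0,
		.port_num        = 1,                       /* example HCA port */
		.qp_access_flags = IBV_ACCESS_REMOTE_WRITE, /* example rights */
	};

	return ibv_modify_qp(qp, &attr,
			     IBV_QP_STATE | IBV_QP_PKEY_INDEX |
			     IBV_QP_PORT | IBV_QP_ACCESS_FLAGS);
}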
  • (8) Post work requests and poll for completion

Use the created QP for communication operations.

See pp_post_send and ibv_poll_cq.

  • (9) Cleanup
Destroy objects in the reverse order you created them:
Delete QP
Delete CQ
Deregister MR
Deallocate PD
Close device

Testing

  • server
# ibv_rc_pingpong -d rxe0 -g 0 -s 128 -r 1 -n 1
  local address:  LID 0x0000, QPN 0x000011, PSN 0x626753, GID fe80::5054:61ff:fe57:1211
  remote address: LID 0x0000, QPN 0x000011, PSN 0x849753, GID fe80::5054:61ff:fe56:1211
256 bytes in 0.00 seconds = 11.38 Mbit/sec
1 iters in 0.00 seconds = 180.00 usec/iter
  • client
# ibv_rc_pingpong -d rxe0 -g 0 172.18.42.162 -s 128 -r 1 -n 1
  local address:  LID 0x0000, QPN 0x000011, PSN 0x849753, GID fe80::5054:61ff:fe56:1211
  remote address: LID 0x0000, QPN 0x000011, PSN 0x626753, GID fe80::5054:61ff:fe57:1211
256 bytes in 0.00 seconds = 16.13 Mbit/sec
1 iters in 0.00 seconds = 127.00 usec/iter

A packet capture shows the communication flow between the client and the server:

The first RC Send Only is the packet the client sends to the server (see here). The server then replies with an RC Ack and sends an RC Send Only of its own to the client (see here).

The preceding TCP packets carry the control information exchanged between the client and the server; see here.

Refs

Multiple queue and RSS in DPDK (2017-10-17, hustcat, http://hustcat.github.io/rss-in-dpdk)

RX queue

rte_eth_dev->data (struct rte_eth_dev_data) holds the device's RX/TX queue information:

struct rte_eth_dev_data {
	char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */

	void **rx_queues; /**< Array of pointers to RX queues. */
	void **tx_queues; /**< Array of pointers to TX queues. */
	uint16_t nb_rx_queues; /**< Number of RX queues. */
	uint16_t nb_tx_queues; /**< Number of TX queues. */
///...

rx_queues is an array of pointers to the receive queues, each pointing to one concrete RX queue. Taking the igb driver (drivers/net/e1000) as an example:

/**
 * Structure associated with each RX queue.
 */
struct igb_rx_queue {
	struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
	volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
	uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
	volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
	volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
	struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
	struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
	struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
	uint16_t            nb_rx_desc; /**< number of RX descriptors. */
	uint16_t            rx_tail;    /**< current value of RDT register. */
	uint16_t            nb_rx_hold; /**< number of held free RX desc. */
	uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
	uint16_t            queue_id;   /**< RX queue index. */
	uint16_t            reg_idx;    /**< RX queue register index. */
	uint8_t             port_id;    /**< Device port identifier. */
	uint8_t             pthresh;    /**< Prefetch threshold register. */
	uint8_t             hthresh;    /**< Host threshold register. */
	uint8_t             wthresh;    /**< Write-back threshold register. */
	uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
	uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
};

Each queue contains a hardware descriptor ring (rx_ring) and a software descriptor ring (sw_ring). rx_ring is used mainly by the driver and the hardware; sw_ring is essentially an array of mbuf pointers and is used mainly by the DPDK application.

  • e1000_adv_rx_desc

This is the hardware descriptor; all e1000_adv_rx_desc entries together form a circular DMA buffer. On receive, pkt_addr points at rte_mbuf->buf_physaddr, so when the NIC receives data it writes it directly into the data buffer backing the mbuf.

/* Receive Descriptor - Advanced */
union e1000_adv_rx_desc {
	struct {
		__le64 pkt_addr; /* Packet buffer address */
		__le64 hdr_addr; /* Header buffer address */
	} read; ///for receive
	struct {
		struct {
			union {
				__le32 data;
				struct {
					__le16 pkt_info; /*RSS type, Pkt type*/
					/* Split Header, header buffer len */
					__le16 hdr_info;
				} hs_rss;
			} lo_dword;
			union {
				__le32 rss; /* RSS Hash */
				struct {
					__le16 ip_id; /* IP id */
					__le16 csum; /* Packet Checksum */
				} csum_ip;
			} hi_dword;
		} lower;
		struct {
			__le32 status_error; /* ext status/error */
			__le16 length; /* Packet length */
			__le16 vlan; /* VLAN tag */
		} upper;
	} wb;  /* writeback */
};
  • igb_rx_entry

Every hardware descriptor has a corresponding software descriptor, which is the bridge for passing data between the DPDK application and the DPDK driver. It is essentially a pointer to an rte_mbuf: rte_mbuf->buf_physaddr is the physical DMA address used by the NIC hardware, while rte_mbuf->buf_addr is the virtual address of the buffer, used by the DPDK application.

/**
 * Structure associated with each descriptor of the RX ring of a RX queue.
 */
struct igb_rx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};

/**
 * The generic rte_mbuf, containing a packet mbuf.
 */
struct rte_mbuf {
	MARKER cacheline0;

	void *buf_addr;           /**< Virtual address of segment buffer. */
	/**
	 * Physical address of segment buffer.
	 * Force alignment to 8-bytes, so as to ensure we have the exact
	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
	 * working on vector drivers easier.
	 */
	phys_addr_t buf_physaddr __rte_aligned(sizeof(phys_addr_t));
///...

Config queue

A DPDK application can call rte_eth_dev_configure to set the number of queues of a port:

		ret = rte_eth_dev_configure(portid, nb_rx_queue,
					(uint16_t)n_tx_queue, &port_conf);

rte_eth_dev_configure calls rte_eth_dev_rx_queue_config and rte_eth_dev_tx_queue_config to set up the RX and TX queues:

rte_eth_dev_configure
|---rte_eth_dev_rx_queue_config
|---rte_eth_dev_tx_queue_config
  • config rx queue
static int
rte_eth_dev_rx_queue_config(struct rte_eth_dev *dev, uint16_t nb_queues)
{
	uint16_t old_nb_queues = dev->data->nb_rx_queues;
	void **rxq;
	unsigned i;

	if (dev->data->rx_queues == NULL && nb_queues != 0) { /* first time configuration */
		dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
				sizeof(dev->data->rx_queues[0]) * nb_queues,
				RTE_CACHE_LINE_SIZE);
		if (dev->data->rx_queues == NULL) {
			dev->data->nb_rx_queues = 0;
			return -(ENOMEM);
		}
	}
///...

Setup queue

  • rte_eth_rx_queue_setup

Every DPDK application calls rte_eth_rx_queue_setup to initialize its receive queues.

int
rte_eth_rx_queue_setup(uint8_t port_id, uint16_t rx_queue_id,
		       uint16_t nb_rx_desc, unsigned int socket_id,
		       const struct rte_eth_rxconf *rx_conf,
		       struct rte_mempool *mp)
{
///...
	ret = (*dev->dev_ops->rx_queue_setup)(dev, rx_queue_id, nb_rx_desc,
					      socket_id, rx_conf, mp); ///eth_igb_ops, eth_igb_rx_queue_setup
}

eth_igb_rx_queue_setup creates the receive queue (igb_rx_queue), and allocates the RX ring hardware descriptors (e1000_adv_rx_desc) and the software ring (igb_rx_entry):

int
eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
			 uint16_t queue_idx,
			 uint16_t nb_desc,
			 unsigned int socket_id,
			 const struct rte_eth_rxconf *rx_conf,
			 struct rte_mempool *mp)
{
	const struct rte_memzone *rz;
	struct igb_rx_queue *rxq;
	struct e1000_hw     *hw;
	unsigned int size;

	hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
///...
	/* First allocate the RX queue data structure. */
	rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
			  RTE_CACHE_LINE_SIZE);
///...
	/*
	 *  Allocate RX ring hardware descriptors. A memzone large enough to
	 *  handle the maximum ring size is allocated in order to allow for
	 *  resizing in later calls to the queue setup function.
	 */
	size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
	rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
				      E1000_ALIGN, socket_id);
///...
	rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
	rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
	rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
	rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;

	/* Allocate software ring. */
	rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
				   sizeof(struct igb_rx_entry) * nb_desc,
				   RTE_CACHE_LINE_SIZE);
}

eth_igb_rx_queue_setup essentially completes the initialization of the DMA descriptor ring.
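On the application side the call is a one-liner per queue; a minimal sketch (the descriptor count and the mbuf pool are assumptions belonging to the application's own setup):

#include <rte_ethdev.h>
#include <rte_mempool.h>

/* Set up one RX queue of a port, backed by the given mbuf pool. */
static int setup_rx_queue(uint8_t port_id, uint16_t queue_id,
			  struct rte_mempool *mbuf_pool)
{
	const uint16_t nb_rxd = 128;   /* example ring size */

	/* Passing NULL for rx_conf selects the driver's default RX config. */
	return rte_eth_rx_queue_setup(port_id, queue_id, nb_rxd,
				      rte_eth_dev_socket_id(port_id),
				      NULL, mbuf_pool);
}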

RSS

  • Configure RSS with DPDK

Setting rxmode.mq_mode = ETH_MQ_RX_RSS (passed to rte_eth_dev_configure) enables RSS on a port. Taking l3fwd as an example:

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode = ETH_MQ_RX_RSS,
		.max_rx_pkt_len = ETHER_MAX_LEN,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 1, /**< IP checksum offload enabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = NULL,
			.rss_hf = ETH_RSS_IP,
		},
	},
	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
};
  • Driver(igb) config RSS

eth_igb_start -> eth_igb_rx_init -> igb_dev_mq_rx_configure

//drivers/net/e1000/igb_rxtx.c
static int
igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
{
	struct e1000_hw *hw =
		E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
	uint32_t mrqc;

	if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
		/*
		 * SRIOV active scheme
		 * FIXME if support RSS together with VMDq & SRIOV
		 */
		mrqc = E1000_MRQC_ENABLE_VMDQ;
		/* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
		mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
		E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
	} else if(RTE_ETH_DEV_SRIOV(dev).active == 0) { ///disable SRIOV
		/*
		 * SRIOV inactive scheme
		 */
		switch (dev->data->dev_conf.rxmode.mq_mode) {
			case ETH_MQ_RX_RSS:
				igb_rss_configure(dev); ///RSS
				break;
///...
}

static void
igb_rss_configure(struct rte_eth_dev *dev)
{
///...
	if (rss_conf.rss_key == NULL)
		rss_conf.rss_key = rss_intel_key; /* Default hash key */
	igb_hw_rss_hash_set(hw, &rss_conf);
}

Refs

KNI in DPDK (2017-10-11, hustcat, http://hustcat.github.io/kni-in-dpdk)

Introduction

The Kernel NIC Interface (KNI) is a DPDK control plane solution that allows userspace applications to exchange packets with the kernel networking stack. To accomplish this, DPDK userspace applications use an IOCTL call to request the creation of a KNI virtual device in the Linux* kernel. The IOCTL call provides interface information and the DPDK’s physical address space, which is re-mapped into the kernel address space by the KNI kernel loadable module that saves the information to a virtual device context. The DPDK creates FIFO queues for packet ingress and egress to the kernel module for each device allocated.

The KNI kernel loadable module is a standard net driver, which, upon receiving the IOCTL call, accesses the DPDK's FIFO queue to receive/transmit packets from/to the DPDK userspace application. The FIFO queues contain pointers to data packets in the DPDK. This:

  • Provides a faster mechanism to interface with the kernel net stack and eliminates system calls

  • Facilitates the DPDK using standard Linux* userspace net tools (tcpdump, ftp, and so on)

  • Eliminate the copy_to_user and copy_from_user operations on packets.

Testing

Load KNI kernel module:

# insmod /root/dpdk/x86/lib/modules/3.10.0-514.el7.x86_64/extra/dpdk/rte_kni.ko

Build KNI application:

# export RTE_SDK=/root/dpdk/x86/share/dpdk
# cd examples/kni
# make
  CC main.o
  LD kni
  INSTALL-APP kni
  INSTALL-MAP kni.map

Run KNI application:

# build/kni -c 0x0f -n 2 -- -P -p 0x3 --config="(0,0,1),(1,2,3)" 
EAL: Detected 4 lcore(s)
EAL: No free hugepages reported in hugepages-1048576kB
EAL: Probing VFIO support...
EAL: WARNING: cpu flags constant_tsc=yes nonstop_tsc=no -> using unreliable clock cycles !
EAL: PCI device 0000:00:05.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:06.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:07.0 on NUMA socket -1
EAL:   probe driver: 8086:100e net_e1000_em
APP: Initialising port 0 ...
KNI: pci: 00:06:00       8086:100e
APP: Initialising port 1 ...
KNI: pci: 00:07:00       8086:100e

Checking link status
.....done
Port 0 Link Up - speed 1000 Mbps - full-duplex
Port 1 Link Up - speed 1000 Mbps - full-duplex
APP: Lcore 1 is writing to port 0
APP: Lcore 2 is reading from port 1
APP: Lcore 0 is reading from port 0
APP: Lcore 3 is writing to port 1
...

where:

  • -c = core bitmask

  • -P = promiscuous mode

  • -p = port hex bitmask

  • --config="(port, lcore_rx, lcore_tx [,lcore_kthread, ...]) ..."

Note that each core can do either TX or RX for one port only.

[root@vm01 ~]# ip a
...
7: vEth0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
    link/ether ba:92:66:e5:2f:35 brd ff:ff:ff:ff:ff:ff
8: vEth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
    link/ether b2:64:67:2f:32:4a brd ff:ff:ff:ff:ff:ff
[root@vm01 ~]# ip addr add 10.0.10.30/24 dev vEth0
[root@vm01 ~]# ip link set vEth0 up
[root@vm03 ~]# ping -c 3 10.0.10.30 
PING 10.0.10.30 (10.0.10.30) 56(84) bytes of data.
64 bytes from 10.0.10.30: icmp_seq=1 ttl=64 time=14.2 ms
64 bytes from 10.0.10.30: icmp_seq=2 ttl=64 time=2.96 ms
64 bytes from 10.0.10.30: icmp_seq=3 ttl=64 time=1.89 ms

Sending SIGUSR1 to the kni application process makes it print its statistics:

[root@vm01 ~]# pkill -10 kni

...
**KNI example application statistics**
======  ==============  ============  ============  ============  ============
 Port    Lcore(RX/TX)    rx_packets    rx_dropped    tx_packets    tx_dropped
------  --------------  ------------  ------------  ------------  ------------
      0          0/ 1            23             0             5             0
      1          2/ 3             1             0             0             0
======  ==============  ============  ============  ============  ============

Implementation

Related code:

The KNI example program lives in examples/kni, the KNI kernel module in lib/librte_eal/linuxapp/kni, and the KNI library in lib/librte_kni.

The overall implementation looks like this:

Receiving data

The application first calls rte_eth_rx_burst to read packets from the network interface, then calls rte_kni_tx_burst to pass them to the kernel module through the FIFO.

		/* Burst rx from eth */
		nb_rx = rte_eth_rx_burst(port_id, 0, pkts_burst, PKT_BURST_SZ);

		/* Burst tx to kni */
		num = rte_kni_tx_burst(p->kni[i], pkts_burst, nb_rx);

rte_kni_tx_burst:

///librte_kni
unsigned
rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
{
	void *phy_mbufs[num];
	unsigned int ret;
	unsigned int i;

	for (i = 0; i < num; i++)
		phy_mbufs[i] = va2pa(mbufs[i]);

	ret = kni_fifo_put(kni->rx_q, phy_mbufs, num);

	/* Get mbufs from free_q and then free them */
	kni_free_mbufs(kni);

	return ret;
}

/**
 * Adds num elements into the fifo. Return the number actually written
 */
static inline unsigned
kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
	unsigned i = 0;
	unsigned fifo_write = fifo->write;
	unsigned fifo_read = fifo->read;
	unsigned new_write = fifo_write;

	for (i = 0; i < num; i++) {
		new_write = (new_write + 1) & (fifo->len - 1);

		if (new_write == fifo_read)
			break;
		fifo->buffer[fifo_write] = data[i];
		fifo_write = new_write;
	}
	fifo->write = fifo_write;
	return i;
}
  • fifo

The DPDK application exchanges data with the kernel module through FIFOs; a FIFO is really a ring buffer in shared memory:

/*
 * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO
 * Write and read should wrap around. Fifo is empty when write == read
 * Writing should never overwrite the read position
 */
struct rte_kni_fifo {
	volatile unsigned write;     /**< Next position to be written*/
	volatile unsigned read;      /**< Next position to be read */
	unsigned len;                /**< Circular buffer length */
	unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
	void *volatile buffer[];     /**< The buffer contains mbuf pointers */
};
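The dequeue side used by rte_kni_rx_burst and the kernel thread is the mirror image of kni_fifo_put shown above; a simplified sketch (not a verbatim copy of the DPDK sources):

/* Get up to num elements from the fifo; returns the number actually read. */
static inline unsigned
kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
	unsigned i = 0;
	unsigned fifo_write = fifo->write;
	unsigned fifo_read = fifo->read;

	for (i = 0; i < num; i++) {
		if (fifo_read == fifo_write)
			break;                 /* empty when read == write */
		data[i] = fifo->buffer[fifo_read];
		fifo_read = (fifo_read + 1) & (fifo->len - 1);
	}
	fifo->read = fifo_read;
	return i;
}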

During initialization the DPDK application has to tell the KNI kernel module the addresses of the FIFO shared memory:

struct rte_kni *
rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
	      const struct rte_kni_conf *conf,
	      struct rte_kni_ops *ops)
{
///...
	/* TX RING */
	mz = slot->m_tx_q;
	ctx->tx_q = mz->addr;
	kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
	dev_info.tx_phys = mz->phys_addr;

	/* RX RING */
	mz = slot->m_rx_q;
	ctx->rx_q = mz->addr;
	kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
	dev_info.rx_phys = mz->phys_addr;
///...
	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); ///to the KNI kernel module
    
}

KNI kernel module

static int
kni_ioctl(struct inode *inode, uint32_t ioctl_num, unsigned long ioctl_param)
{
///..
	case _IOC_NR(RTE_KNI_IOCTL_CREATE):
		ret = kni_ioctl_create(net, ioctl_num, ioctl_param);

kni_ioctl_create creates the corresponding network device (vEthX), sets up the FIFO shared memory, and starts the kernel thread:

static int
kni_ioctl_create(struct net *net, uint32_t ioctl_num,
		unsigned long ioctl_param)
{
///...
	net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name,
#ifdef NET_NAME_USER
							NET_NAME_USER,
#endif
							kni_net_init);
///...
	/* Translate user space info into kernel space info */
	kni->tx_q = phys_to_virt(dev_info.tx_phys);
	kni->rx_q = phys_to_virt(dev_info.rx_phys);
	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
	kni->free_q = phys_to_virt(dev_info.free_phys);
///...
	ret = kni_run_thread(knet, kni, dev_info.force_bind);
///...
}

kernel thread:

The KNI kernel thread keeps reading packets from the FIFO shared memory and hands them to the kernel network stack for further processing:

static int
kni_thread_single(void *data)
{
	struct kni_net *knet = data;
	int j;
	struct kni_dev *dev;

	while (!kthread_should_stop()) {
		down_read(&knet->kni_list_lock);
		for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
			list_for_each_entry(dev, &knet->kni_list_head, list) {
				kni_net_rx(dev);
				kni_net_poll_resp(dev);
			}
		}
		up_read(&knet->kni_list_lock);
///...
}

/* rx interface */
void
kni_net_rx(struct kni_dev *kni)
{
	/**
	 * It doesn't need to check if it is NULL pointer,
	 * as it has a default value
	 */
	(*kni_net_rx_func)(kni); ///kni_net_rx_normal
}
  • kni_net_rx_func

kni_net_rx_func reads packets from the FIFO shared memory, allocates an skb, copies the data, and calls netif_rx_ni to inject the packet into the kernel network stack:

static void kni_net_rx_normal(struct kni_dev *kni);

/* kni rx function pointer, with default to normal rx */
static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
/*
 * RX: normal working mode
 */
static void
kni_net_rx_normal(struct kni_dev *kni)
{
///...
	/* Calculate the number of entries to dequeue from rx_q */
	num_rx = min_t(uint32_t, num_fq, MBUF_BURST_SZ);

	/* Burst dequeue from rx_q */
	num_rx = kni_fifo_get(kni->rx_q, kni->pa, num_rx);
	if (num_rx == 0)
		return;
///...
	/* Transfer received packets to netif */
	for (i = 0; i < num_rx; i++) {
		kva = pa2kva(kni->pa[i]);
		len = kva->pkt_len;
		data_kva = kva2data_kva(kva);
		kni->va[i] = pa2va(kni->pa[i], kva);

		skb = dev_alloc_skb(len + 2);
///...
		skb->dev = dev;
		skb->protocol = eth_type_trans(skb, dev);
		skb->ip_summed = CHECKSUM_UNNECESSARY;

		/* Call netif interface */
		netif_rx_ni(skb); ///enter the kernel network stack
///...
	}
///...
}

Sending data

  • KNI kernel interface
/*
 * Transmit a packet (called by the kernel)
 */
static int
kni_net_tx(struct sk_buff *skb, struct net_device *dev)
{
///...
	/* dequeue a mbuf from alloc_q */
	ret = kni_fifo_get(kni->alloc_q, &pkt_pa, 1);
	if (likely(ret == 1)) {
		void *data_kva;

		pkt_kva = pa2kva(pkt_pa);
		data_kva = kva2data_kva(pkt_kva);
		pkt_va = pa2va(pkt_pa, pkt_kva);

		len = skb->len; /// data length
		memcpy(data_kva, skb->data, len); /// copy data
		if (unlikely(len < ETH_ZLEN)) {
			memset(data_kva + len, 0, ETH_ZLEN - len);
			len = ETH_ZLEN;
		}
		pkt_kva->pkt_len = len;
		pkt_kva->data_len = len;

		/* enqueue mbuf into tx_q */
		ret = kni_fifo_put(kni->tx_q, &pkt_va, 1);/// put tx_q
///...
  • DPDK app
		/* Burst rx from kni */
		num = rte_kni_rx_burst(p->kni[i], pkts_burst, PKT_BURST_SZ);

		/* Burst tx to eth */
		nb_tx = rte_eth_tx_burst(port_id, 0, pkts_burst, (uint16_t)num);

rte_kni_rx_burst dequeues mbufs from the tx_q queue:

unsigned
rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
{
	unsigned ret = kni_fifo_get(kni->tx_q, (void **)mbufs, num);

	/* If buffers removed, allocate mbufs and then put them into alloc_q */
	if (ret)
		kni_allocate_mbufs(kni);

	return ret;
}

Refs

Introduction to the UIO (2017-10-10, hustcat, http://hustcat.github.io/introduction-to-uio)

UIO

Each UIO device can be accessed through a device file (/dev/uioX) and several sysfs attribute files.

The registers or RAM of a UIO device can be accessed by mmap()ing /dev/uioX.

Interrupts of a UIO device are obtained by reading /dev/uioX directly; the read() blocks and returns as soon as an interrupt occurs.

Each UIO device is accessed through a device file and several sysfs attribute files. The device file will be called /dev/uio0 for the first device, and /dev/uio1, /dev/uio2 and so on for subsequent devices.

/dev/uioX is used to access the address space of the card. Just use mmap() to access registers or RAM locations of your card.

Interrupts are handled by reading from /dev/uioX. A blocking read() from /dev/uioX will return as soon as an interrupt occurs. You can also use select() on /dev/uioX to wait for an interrupt. The integer value read from /dev/uioX represents the total interrupt count. You can use this number to figure out if you missed some interrupts.
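A minimal user-space sketch of that interrupt loop (the device path is an example; error handling trimmed):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/uio0", O_RDWR);
	uint32_t event_count;

	if (fd < 0) {
		perror("open /dev/uio0");
		return 1;
	}
	for (;;) {
		/* Blocks until the next interrupt; the value read is the
		 * total interrupt count for this device. */
		if (read(fd, &event_count, sizeof(event_count)) != sizeof(event_count))
			break;
		printf("interrupt, total count = %u\n", event_count);
	}
	close(fd);
	return 0;
}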

uio driver

uio_pci_generic

A UIO device needs the support of a UIO kernel driver; uio_pci_generic is a generic kernel driver for PCI UIO devices.

UIO does not completely eliminate the need for kernel-space code. A small module is required to set up the device, perhaps interface to the PCI bus, and register an interrupt handler. The last function (interrupt handling) is particularly important; much can be done in user space, but there needs to be an in-kernel interrupt handler which knows how to tell the device to stop crying for attention.

///drivers/uio/uio_pci_generic.c
static struct pci_driver driver = {
	.name = "uio_pci_generic",
	.id_table = NULL, /* only dynamic id's */
	.probe = probe,
	.remove = remove,
};

interrupt

When a UIO driver registers its device with uio_register_device, the interrupt handler uio_interrupt is registered as well:

/**
 * uio_register_device - register a new userspace IO device
 * @owner:	module that creates the new device
 * @parent:	parent device
 * @info:	UIO device capabilities
 *
 * returns zero on success or a negative error code.
 */
int __uio_register_device(struct module *owner,
			  struct device *parent,
			  struct uio_info *info)
{
///...
	if (info->irq && (info->irq != UIO_IRQ_CUSTOM)) {
		ret = request_irq(info->irq, uio_interrupt,
				  info->irq_flags, info->name, idev);
		if (ret)
			goto err_request_irq;
	}
}

/**
 * uio_interrupt - hardware interrupt handler
 * @irq: IRQ number, can be UIO_IRQ_CYCLIC for cyclic timer
 * @dev_id: Pointer to the devices uio_device structure
 */
static irqreturn_t uio_interrupt(int irq, void *dev_id)
{
	struct uio_device *idev = (struct uio_device *)dev_id;
	irqreturn_t ret = idev->info->handler(irq, idev->info); ///irqhandler

	if (ret == IRQ_HANDLED)
		uio_event_notify(idev->info); ///notify userspace

	return ret;
}
  • uio_event_notify

Every time an interrupt occurs on a UIO device, uio_device->event is incremented and the waiting processes are woken up:

/** notify userspace application
 * uio_event_notify - trigger an interrupt event
 * @info: UIO device capabilities
 */
void uio_event_notify(struct uio_info *info)
{
	struct uio_device *idev = info->uio_dev;

	atomic_inc(&idev->event);
	wake_up_interruptible(&idev->wait); ///wake up waiting process
	kill_fasync(&idev->async_queue, SIGIO, POLL_IN);
}
  • read /dev/uioX

Every time /dev/uioX is opened, the kernel creates a uio_listener and attaches it to struct file->private_data:

static int uio_open(struct inode *inode, struct file *filep)
{
	struct uio_device *idev;
	struct uio_listener *listener;
	int ret = 0;

	mutex_lock(&minor_lock);
	idev = idr_find(&uio_idr, iminor(inode));
	mutex_unlock(&minor_lock);
///...
	listener = kmalloc(sizeof(*listener), GFP_KERNEL);
	if (!listener) {
		ret = -ENOMEM;
		goto err_alloc_listener;
	}

	listener->dev = idev;
	listener->event_count = atomic_read(&idev->event);
	filep->private_data = listener;

Then, on every read(), if uio_listener->event_count differs from uio_device->event, an interrupt has occurred and the current count is returned; otherwise the current process is added to the uio_device->wait queue and put to sleep:

static ssize_t uio_read(struct file *filep, char __user *buf,
			size_t count, loff_t *ppos)
{
	struct uio_listener *listener = filep->private_data;
	struct uio_device *idev = listener->dev;
	DECLARE_WAITQUEUE(wait, current);
///...
	add_wait_queue(&idev->wait, &wait);

	do {
		set_current_state(TASK_INTERRUPTIBLE);

		event_count = atomic_read(&idev->event);
		if (event_count != listener->event_count) { ///irq happened
			if (copy_to_user(buf, &event_count, count))
				retval = -EFAULT;
			else {
				listener->event_count = event_count;
				retval = count;
			}
			break;
		}

		if (filep->f_flags & O_NONBLOCK) {
			retval = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			retval = -ERESTARTSYS;
			break;
		}
		schedule();
	} while (1);

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&idev->wait, &wait);

	return retval;
}
  • uio_mmap

When the user-space driver mmap()s /dev/uioX, the kernel calls uio_mmap to map the UIO device's RAM:

static int uio_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct uio_listener *listener = filep->private_data;
	struct uio_device *idev = listener->dev;
	int mi;
	unsigned long requested_pages, actual_pages;
	int ret = 0;

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;

	vma->vm_private_data = idev;

	mi = uio_find_mem_index(vma);
	if (mi < 0)
		return -EINVAL;

	requested_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	actual_pages = ((idev->info->mem[mi].addr & ~PAGE_MASK)
			+ idev->info->mem[mi].size + PAGE_SIZE -1) >> PAGE_SHIFT;
	if (requested_pages > actual_pages)
		return -EINVAL;

	if (idev->info->mmap) {
		ret = idev->info->mmap(idev->info, vma);
		return ret;
	}

	switch (idev->info->mem[mi].memtype) {
		case UIO_MEM_PHYS:
			return uio_mmap_physical(vma);
		case UIO_MEM_LOGICAL:
		case UIO_MEM_VIRTUAL:
			return uio_mmap_logical(vma);
		default:
			return -EINVAL;
	}
}

For more on mmap, see Linux MMAP & Ioremap introduction.

userspace driver

fd = open("/dev/uio0", O_RDWR|O_SYNC);
/* Map device's registers into user memory */
/* fitting the memory area on pages */
offset = addr & ~PAGE_MASK;
addr = 0 /* region 0 */ * PAGE_SIZE;
size = (size + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
iomem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED,
             fd, addr);
iomem += offset;
/* Stop the counting */
*(u_char *)SH_TMU_TSTR(iomem) &= ~(TSTR_TSTR2);
...
/* Wait for an interrupt */
read(fd, &n_pending, sizeof(u_long));
val = *(u_int *)SH_TMU2_TCNT(iomem);
...
/* Stop the TMU */
*(u_char *)SH_TMU_TSTR(iomem) &= ~(TSTR_TSTR2);
munmap(iomem, size);
close(fd);

For details, see Using UIO in an embedded platform.

Refs

Checksum in Linux Kernel (2017-08-15, hustcat, http://hustcat.github.io/checksum-in-kernel)

Calculating the IP/TCP/UDP checksum

For how the kernel calculates IP/TCP/UDP checksums, see How to Calculate IP/TCP/UDP Checksum - Part 1 Theory.

In short, the data to be checksummed is summed in 16-bit units (with end-around carry) and the result is then inverted.
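A user-space version of that algorithm (the classic RFC 1071 one's-complement sum, not the kernel's optimized assembly) looks like this:

#include <stddef.h>
#include <stdint.h>

/* Internet checksum: add the data as 16-bit words, fold the carries back in
 * (one's-complement addition), then return the bitwise complement. */
static uint16_t inet_checksum(const void *data, size_t len)
{
	const uint16_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += *p++;
		len -= 2;
	}
	if (len)                              /* odd trailing byte */
		sum += *(const uint8_t *)p;

	while (sum >> 16)                     /* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;
}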

When TCP receives a packet, it verifies the checksum:

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr, ///check TCP/UDP pseudo-header checksum
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0); ///calc pseudo header checksum

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb); /// compute the checksum of the whole packet on top of the pseudo-header sum
	}
	return 0;
}

csum_tcpudp_nofold computes the checksum of the pseudo-header; __skb_checksum_complete then computes the checksum of the whole skb on top of that pseudo-header sum (skb->csum).

net_device->features

The net_device->features field describes the device's capabilities. Some of its bits describe the hardware checksum offload capabilities:

#define NETIF_F_HW_CSUM		__NETIF_F(HW_CSUM)
#define NETIF_F_IP_CSUM		__NETIF_F(IP_CSUM) ///ipv4 + TCP/UDP
#define NETIF_F_IPV6_CSUM	__NETIF_F(IPV6_CSUM)

NETIF_F_IP_CSUM means the hardware can compute the L4 checksum, but only for TCP and UDP over IPv4; some devices extend this to VXLAN and NVGRE. NETIF_F_IP_CSUM is a protocol-aware way of offloading the checksum. Concretely, the upper layer provides two checksum parameters (csum_start and csum_offset).

NETIF_F_HW_CSUM is a protocol agnostic method to offload the transmit checksum. In this method the host provides checksum related parameters in a transmit descriptor for a packet. These parameters include the starting offset of data to checksum and the offset in the packet where the computed checksum is to be written. The length of data to checksum is implicitly the length of the packet minus the starting offset.

It is worth noting that igb/ixgbe use NETIF_F_IP_CSUM.

sk_buff

Depending on whether the skb is a received packet or a transmitted packet, the meanings of skb->csum and skb->ip_summed differ.

/*
 *	@csum: Checksum (must include start/offset pair)
 *	@csum_start: Offset from skb->head where checksumming should start
 *	@csum_offset: Offset from csum_start where checksum should be stored
 *	@ip_summed: Driver fed us an IP checksum
 */
struct sk_buff {
	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};

	__u8			local_df:1,
				cloned:1,
				ip_summed:2,
				nohdr:1,
				nfctinfo:3;

The usual values of skb->ip_summed are:

/* Don't change this without changing skb_csum_unnecessary! */
#define CHECKSUM_NONE 0
#define CHECKSUM_UNNECESSARY 1 ///hardware verified the checksums
#define CHECKSUM_COMPLETE 2
#define CHECKSUM_PARTIAL 3 ///only compute IP header, not include data

CSUM on receive

For received packets, skb->csum may contain the L4 checksum, and skb->ip_summed describes the state of the L4 checksum:

  • (1) CHECKSUM_UNNECESSARY

CHECKSUM_UNNECESSARY means the underlying hardware has already verified the checksum. Taking the igb driver as an example:

igb_poll -> igb_clean_rx_irq -> igb_process_skb_fields -> igb_rx_checksum:

static inline void igb_rx_checksum(struct igb_ring *ring,
				   union e1000_adv_rx_desc *rx_desc,
				   struct sk_buff *skb)
{
///...
	/* Rx checksum disabled via ethtool */
	if (!(ring->netdev->features & NETIF_F_RXCSUM)) ///RXCSUM disabled
		return;

	/* TCP/UDP checksum error bit is set */
	if (igb_test_staterr(rx_desc,
			     E1000_RXDEXT_STATERR_TCPE |
			     E1000_RXDEXT_STATERR_IPE)) {
		/* work around errata with sctp packets where the TCPE aka
		 * L4E bit is set incorrectly on 64 byte (60 byte w/o crc)
		 * packets, (aka let the stack check the crc32c)
		 */
		if (!((skb->len == 60) &&
		      test_bit(IGB_RING_FLAG_RX_SCTP_CSUM, &ring->flags))) {
			u64_stats_update_begin(&ring->rx_syncp);
			ring->rx_stats.csum_err++;
			u64_stats_update_end(&ring->rx_syncp);
		}
		/* let the stack verify checksum errors (hand the packet to the stack for further csum verification) */
		return;
	}
	/* It must be a TCP or UDP packet with a valid checksum */
	if (igb_test_staterr(rx_desc, E1000_RXD_STAT_TCPCS |
				      E1000_RXD_STAT_UDPCS))
		skb->ip_summed = CHECKSUM_UNNECESSARY; ///stack don't needed verify
}

When the TCP layer receives the packet and finds skb->ip_summed set to CHECKSUM_UNNECESSARY, it does not verify the checksum again:

int tcp_v4_rcv(struct sk_buff *skb)
{
///...
	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto csum_error;
///...
}

static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
	return skb->ip_summed & CHECKSUM_UNNECESSARY;
}
  • (2) CHECKSUM_NONE

The checksum in csum is not valid; this can happen for several reasons:

The device does not support hardware checksumming;

The device computed the hardware checksum and found the frame to be corrupted. In that case the driver could simply drop the frame, but some drivers (e.g. e1000/igb/ixgbe) do not; instead they set ip_summed to CHECKSUM_NONE and let the upper protocol stack recompute the checksum and handle the error.

  • (3) CHECKSUM_COMPLETE

This indicates the NIC has already computed the checksum over the L4 header and payload and that skb->csum has been filled in; the L4 receiver only needs to add the pseudo-header and verify the result. Taking TCP as an example:

static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr, ///check TCP/UDP pseudo-header checksum
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}
///...
}

It is worth noting that igb/ixgbe do not use CHECKSUM_COMPLETE; they use CHECKSUM_UNNECESSARY instead.

Note the difference between CHECKSUM_COMPLETE and CHECKSUM_UNNECESSARY: with the former, the upper layer still has to compute the pseudo-header checksum and then verify the result, see tcp_v4_check. In earlier kernel versions this value was called CHECKSUM_HW.

  • The veth bug

Here is an interesting problem: Linux kernel bug delivers corrupt TCP/IP data to Mesos, Kubernetes, Docker containers.

The veth device changes CHECKSUM_NONE into CHECKSUM_UNNECESSARY. As a result, when the hardware receives a corrupted frame and it is later forwarded to a veth device, it ends up marked CHECKSUM_UNNECESSARY, and the upper protocol stack (TCP) no longer verifies the packet's checksum.

static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
///...

	/* don't change ip_summed == CHECKSUM_PARTIAL, as that
	 * will cause bad checksum on forwarded packets
	 */
	if (skb->ip_summed == CHECKSUM_NONE &&
	    rcv->features & NETIF_F_RXCSUM)
		skb->ip_summed = CHECKSUM_UNNECESSARY;
}

veth was originally a device for local communication, and local frames are unlikely to be corrupted. On transmit, if the protocol stack has already computed the checksum, skb->ip_summed is set to CHECKSUM_NONE, so for local veth traffic there is no need for the receiver to verify the checksum again. In container/virtualization scenarios, however, packets arriving on a veth may come from the network, and keeping this behaviour allows corrupted frames to be delivered to the application layer.

CSUM on transmit

Similarly, for transmitted packets skb->ip_summed describes the state of the L4 checksum and tells the underlying NIC whether it still has to handle the checksum:

  • (1) CHECKSUM_NONE

Here CHECKSUM_NONE means the protocol stack has already computed the checksum and the device has nothing to do.

  • (2) CHECKSUM_PARTIAL

CHECKSUM_PARTIAL means hardware checksumming is used: the protocol stack has already computed the L4 pseudo-header checksum and stored it in the check field (e.g. uh->check), and the device only has to compute the checksum over the whole L4 segment (header plus payload).

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
///...

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
					skb->ip_summed = CHECKSUM_PARTIAL;
}


static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
///...
	icsk->icsk_af_ops->send_check(sk, skb); ///tcp_v4_send_check
}


static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) { ///HW CSUM
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); ///add IPv4 pseudo header checksum
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum)); ///ip_summed == CHECKSUM_NONE
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
  • dev_queue_xmit

Finally, if dev_queue_xmit finds at transmit time that the device does not support hardware checksumming, the checksum is computed in software (is this path actually taken?):

int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
///...
		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features))) ///GSO(software offload)
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		} else { ///hardware offload
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) { ///only header csum is computed
				if (skb->encapsulation)
					skb_set_inner_transport_header(skb,
						skb_checksum_start_offset(skb));
				else
					skb_set_transport_header(skb,
						skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) && ///check hardware if support offload
				     skb_checksum_help(skb)) ///HW not support CSUM
					goto out_kfree_skb;
			}
		}
}

ip_summed == CHECKSUM_PARTIAL means the protocol stack has not finished computing the checksum: it has only computed the pseudo-header and leaves the transport-layer data to the hardware. If the underlying hardware does not support checksum offload, skb_checksum_help finishes the checksum computation.

Remote checksum

TODO:

Refs

Mount namespace and mount propagation (2017-03-10, hustcat, http://hustcat.github.io/mount-namespace-and-mount-propagation)

Mount namespace and problems

When a new mount namespace is created, it receives a copy of the mount point list replicated from the namespace of the caller of clone() or unshare().

create_new_namespaces -> copy_mnt_ns -> dup_mnt_ns:

/*
 * Allocate a new namespace structure and populate it with contents
 * copied from the namespace of the passed in task structure.
 */
static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
		struct fs_struct *fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct vfsmount *p, *q;

	new_ns = alloc_mnt_ns();
	if (IS_ERR(new_ns))
		return new_ns;

	down_write(&namespace_sem);
	/* First pass: copy the tree topology */
	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
					CL_COPY_ALL | CL_EXPIRE); ///copy all mounts of the original namespace
 ///...

Each mount namespace has its own independent view of the filesystem, but this isolation also brings problems. For example, when a new disk is added to the system, in the original implementation every namespace had to mount the disk separately before it became visible. Often we would like to mount it once and have it visible in all mount namespaces. For this, kernel 2.6.15 introduced the shared subtrees feature.

The key benefit of shared subtrees is to allow automatic, controlled propagation of mount and unmount events between namespaces. This means, for example, that mounting an optical disk in one mount namespace can trigger a mount of that disk in all other namespaces.

To support the shared subtrees feature, every mount point carries a propagation type, which determines whether creating/removing (child) mount points under this mount point propagates to other mount points.

propagation type

The kernel has the following propagation types; a minimal mount(2) sketch for setting them follows the list:

  • MS_SHARED

This mount point shares mount and unmount events with other mount points that are members of its “peer group”. When a mount point is added or removed under this mount point, this change will propagate to the peer group, so that the mount or unmount will also take place under each of the peer mount points. Propagation also occurs in the reverse direction, so that mount and unmount events on a peer mount will also propagate to this mount point.

  • MS_PRIVATE

This is the converse of a shared mount point. The mount point does not propagate events to any peers, and does not receive propagation events from any peers.

  • MS_SLAVE

This propagation type sits midway between shared and private. A slave mount has a master—a shared peer group whose members propagate mount and unmount events to the slave mount. However, the slave mount does not propagate events to the master peer group.

  • MS_UNBINDABLE

This mount point is unbindable. Like a private mount point, this mount point does not propagate events to or from peers. In addition, this mount point can’t be the source for a bind mount operation.
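As referenced above, these types are set programmatically with mount(2); a minimal sketch of what `mount --make-private /` and `mount --make-shared /X` do (requires root, and /X must already be a mount point; /X is just an example path):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Equivalent of `mount --make-private /` (recursively). */
	if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0)
		perror("make / private");

	/* Equivalent of `mount --make-shared /X`. */
	if (mount("none", "/X", NULL, MS_SHARED, NULL) < 0)
		perror("make /X shared");

	return 0;
}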

A few things to note:

(1) The propagation type is a per-mount-point setting.

(2) The propagation type governs the propagation of mount/umount events for mount points immediately under the given mount point. For example, if a new mount point Y is created under mount point X, Y propagates to X's peer group, but X's setting does not affect mount points created underneath Y.

(3)

Peer groups

A peer group is a set of mount points that propagate mount/umount events to one another. For a shared mount point, new members are added when a new mount namespace is created or when the mount point is used as the source of a bind mount. Both cases create a new mount point, and the new mount point joins the peer group of the original one. Conversely, when a mount namespace is torn down or a mount point is unmounted, it is removed from its peer group.

A peer group is a set of mount points that propagate mount and unmount events to one another. A peer group acquires new members when a mount point whose propagation type is shared is either replicated during the creation of a new namespace or is used as the source for a bind mount. In both cases, the new mount point is made a member of the same peer group as the existing mount point. Conversely, a mount point ceases to be a member of a peer group when it is unmounted, either explicitly, or implicitly when a mount namespace is torn down because the last member process terminates or moves to another namespace.

  • Example

Run in sh1: make / private and create two shared mount points:

sh1# mount --make-private / 
sh1# mount --make-shared /dev/sda3 /X 
sh1# mount --make-shared /dev/sda5 /Y 

Then run in sh2: create a new mount namespace:

sh2# unshare -m --propagation unchanged sh 

Then run in sh1 again: bind mount X onto Z:

sh1# mkdir /Z 
sh1# mount --bind /X /Z 

This produces two peer groups:

  • The first peer group contains X, X', and Z: X and X' because of the namespace creation, X and Z because of the bind mount.
  • The second peer group contains only Y and Y'.

Note that because / is private, the bind mount of Z does not propagate to the second namespace.

Here is the Docker code that uses private mounts:

// InitializeMountNamespace sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func InitializeMountNamespace(rootfs, console string, sysReadonly bool, mountConfig *MountConfig) error {
	var (
		err  error
		flag = syscall.MS_PRIVATE
	)

	if mountConfig.NoPivotRoot {
		flag = syscall.MS_SLAVE   ///mount events in the container do not propagate to the init ns
	}

	if err := syscall.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { ///make / private, fully isolated from the init ns
		return fmt.Errorf("mounting / with flags %X %s", (flag | syscall.MS_REC), err)
	}

	if err := syscall.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
	}
///...

Reference

TCP SYN cookies make the window size suddenly become smaller (2017-03-03, hustcat, http://hustcat.github.io/tcp_syn_cookies_and_window_size)

The problem

Recently, a service team reported a problem where the window of a TCP connection suddenly became abnormally small, making data transfer extremely slow, as shown below:

When the server replied with the SYN-ACK, the window size was still 144800; but when the server acknowledged the client's first data packet, the window suddenly dropped to 60, even though the data packet was only 86 bytes long.

After quite a bit of debugging together with several colleagues, we finally found that it was caused by TCP SYN cookies. Here is a brief summary for posterity.

TCP introduced SYN cookies to deal with the SYN flood problem.

SYN cookie is a technique used to resist SYN flood attacks.The technique’s primary inventor Daniel J. Bernstein defines SYN cookies as “particular choices of initial TCP sequence numbers by TCP servers.” In particular, the use of SYN cookies allows a server to avoid dropping connections when the SYN queue fills up. Instead, the server behaves as if the SYN queue had been enlarged. The server sends back the appropriate SYN+ACK response to the client but discards the SYN queue entry. If the server then receives a subsequent ACK response from the client, the server is able to reconstruct the SYN queue entry using information encoded in the TCP sequence number.

Kernel parameters

  • sysctl_max_syn_backlog

sysctl_max_syn_backlog controls the length of a listening socket's half-open (SYN_RECV) queue:

struct inet_connection_sock {

	struct request_sock_queue icsk_accept_queue; ////SYN_RECV sockets queue
}

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
///...
	sk->sk_state = TCP_LISTEN;
}

int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
{ 
///...
	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	
	for (lopt->max_qlen_log = 3;
	     (1 << lopt->max_qlen_log) < nr_table_entries;
	     lopt->max_qlen_log++);
///...
}

static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
	return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}

The kernel computes the length of the listening socket's SYN queue from the listen() backlog and sysctl_max_syn_backlog. When the queue is full, the following log message is printed:

TCP: TCP: Possible SYN flooding on port 6000. Dropping request.  Check SNMP counters.
  • sysctl_tcp_syncookies

Controls whether the TCP SYN cookies mechanism is enabled.

extern int sysctl_tcp_syncookies;

How TCP handles new connections

When the receiver gets the sender's SYN packet, it creates a request_sock, returns a SYN/ACK to the sender, and adds the request_sock to the LISTEN socket's SYN table:

tcp_v4_do_rcv(TCP_LISTEN) -> tcp_rcv_state_process -> tcp_v4_conn_request:

///ipv4_specific, LISTEN socket handle SYN packet
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct request_sock *req;
///...
	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) { ///SYN queue is full
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie) ///no tcp_syncookies, drop SKB
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}
	
	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;
///...
	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, ///send SYN/ACK
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie) ///tcp_syncookies, don't add to SYN queue
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); ///Add SYN table
		if (fastopen_cookie_present(&foc) && foc.len != 0)
			NET_INC_STATS_BH(sock_net(sk),
			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) ///fast open
		goto drop_and_free;
///...
}

When the receiver later gets the ACK packet from the sender, the kernel looks up the corresponding request_sock in the SYN table and creates the new socket in tcp_check_req; at this point the TCP connection is established (TCP_ESTABLISHED): tcp_v4_do_rcv(TCP_LISTEN) -> tcp_v4_hnd_req -> tcp_check_req:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr); ///get request_sock from SYN table
	if (req)
		return tcp_check_req(sk, skb, req, prev, false); /// create new socket
///...
}

SYN cookies

When the tcp_syncookies option is disabled, the SKB is simply dropped once the listen socket's SYN queue is full:

///ipv4_specific, LISTEN socket handle SYN packet
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
///..
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) { ///SYN queue is full
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie) ///no tcp_syncookies, drop SKB
			goto drop;
	}

With tcp_syncookies enabled, when the listen socket's SYN queue is full the kernel still creates a request_sock and returns the SYN/ACK to the peer, but instead of adding the request_sock to the SYN queue it frees it:

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, ///send SYN/ACK
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie) ///tcp_syncookies, don't add to SYN queue
			goto drop_and_free;

As a result, when the ACK from the peer arrives, tcp_v4_hnd_req cannot find a matching request_sock in the SYN queue and falls into the syncookies handling path: tcp_v4_do_rcv -> tcp_v4_hnd_req:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
///...
#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

cookie_v4_check validates the cookie, creates a new request_sock, and then continues with the normal connection setup.

SYN cookies and TCP options

For connections that go through the SYN cookies path, the kernel keeps no state for the socket, so the TCP options carried in the SYN packet are lost.

  • MSS

When the receiver returns the cookie to the sender (in the SYN-ACK), it encodes the MSS value into the cookie; when the sender echoes the cookie back, the receiver recovers the MSS value in cookie_v4_check:

struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
			     struct ip_options *opt)
{
///...
	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
		goto out;

	if (tcp_synq_no_recent_overflow(sk) ||
	    (mss = cookie_check(skb, cookie)) == 0) { ///mss option value
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
		goto out;
	}

///...
	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
	if (!req)
		goto out;
///...
	/* Try to redo what tcp_v4_send_synack did. */
	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
	///initial window
	tcp_select_initial_window(tcp_full_space(sk), req->mss,
				  &req->rcv_wnd, &req->window_clamp,
				  ireq->wscale_ok, &rcv_wscale,
				  dst_metric(&rt->dst, RTAX_INITRWND));

	ireq->rcv_wscale  = rcv_wscale;
///...
}
  • wscale

However, other options such as wscale and SACK are lost. The kernel later used the timestamp option to carry wscale and then removed that again, see 1 and 2; for the details see Improving syncookies.

On the TCP SYN cookies path, the receiver recomputes wscale after it receives the peer's ACK instead of using the wscale negotiated in the SYN/SYN-ACK handshake. Since the wscale computation depends on parameters such as the receive buffer size, the value computed the second time may differ from the one negotiated earlier, leaving the sender and the receiver with inconsistent wscale values:

void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space); ///sk_rcvbuf size
///...
	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

And the advertised TCP window is scaled by wscale: the server right-shifts the window it advertises using the recomputed wscale, while the client scales it back up using the wscale it learned during the handshake, which is exactly what caused the problem described at the beginning.
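
A tiny sketch of the arithmetic behind this; the wscale values below are assumed purely for illustration:

#include <stdio.h>

int main(void)
{
	/* Assumed values, for illustration only. */
	unsigned int real_window   = 144800;	/* window the server actually wants to advertise */
	unsigned int synack_wscale = 7;		/* wscale carried in the SYN-ACK, remembered by the client */
	unsigned int cookie_wscale = 11;	/* wscale recomputed by cookie_v4_check on the server */

	unsigned int on_wire = real_window >> cookie_wscale;	/* server scales down with the new value */
	unsigned int seen    = on_wire << synack_wscale;	/* client scales up with the old value */

	printf("on wire: %u, client sees: %u\n", on_wire, seen);
	/* on wire: 70, client sees: 8960 -- the effective window silently shrank ~16x. */
	return 0;
}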

Summary

This was really a simple problem, but the debugging took quite a few detours: from the start we focused on the TCP window mechanism and tried to find the problem there, overlooking some key kernel log output. It proves the old point once again: problems that look complicated on the surface usually have very simple causes!

In any case, the kernel's current TCP SYN cookies mechanism has this flaw, so use it with care.

Reference

]]>
Dive deep into inotify and overlayfs 2017-01-06T11:00:30+00:00 hustcat http://hustcat.github.io/dive-into-inotify-and-overlayfs Introduction

Userspace can use the kernel's filesystem notification APIs to learn about changes in the filesystem, such as files or directories being opened, closed, created or deleted. The kernel first implemented dnotify in 2.4.0, but dnotify reused the fcntl system call and had quite a few problems, for example: (1) dnotify can only watch directories, not individual files; (2) it delivers events to the process via the SIGIO signal, but signals are asynchronous and may be lost, and they carry very little information, e.g. there is no way to tell which file inside the directory the event happened on.

Later, in 2.6.13, the kernel implemented inotify, which added several new system calls and solved dnotify's problems.

  • inotifywait

We can use inotifywait, shipped with inotify-tools, to watch events on a directory.

#inotifywait -rme modify,open,create,delete,close /root/dbyin/test/
Setting up watches.  Beware: since -r was given, this may take a while!
Watches established.
/root/dbyin/test/ CREATE f1.txt
/root/dbyin/test/ OPEN f1.txt
/root/dbyin/test/ MODIFY f1.txt
/root/dbyin/test/ CLOSE_WRITE,CLOSE f1.txt
/root/dbyin/test/ DELETE f1.txt

Another terminal:

#echo hello > /root/dbyin/test/f1.txt
#rm /root/dbyin/test/f1.txt

For an example program, see here.
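
For reference, a minimal sketch that uses the inotify API directly might look like this (the watched path is the same test directory as above, assumed to exist):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	ssize_t len;

	int fd = inotify_init1(0);	/* one inotify fd corresponds to one fsnotify_group in the kernel */
	if (fd < 0) {
		perror("inotify_init1");
		exit(1);
	}

	int wd = inotify_add_watch(fd, "/root/dbyin/test",
				   IN_OPEN | IN_MODIFY | IN_CREATE | IN_DELETE | IN_CLOSE);
	if (wd < 0) {
		perror("inotify_add_watch");
		exit(1);
	}

	while ((len = read(fd, buf, sizeof(buf))) > 0) {	/* blocks until events arrive */
		char *p = buf;
		while (p < buf + len) {
			struct inotify_event *ev = (struct inotify_event *)p;
			printf("wd=%d mask=0x%x name=%s\n", ev->wd, ev->mask, ev->len ? ev->name : "");
			p += sizeof(*ev) + ev->len;
		}
	}
	return 0;
}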

Implementation of inotify

Core data structures

  • fsnotify_group

fsnotify_group represents an inotify instance; one is created every time userspace calls inotify_init, and it keeps all the event information of that instance:

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 */
struct fsnotify_group {

	const struct fsnotify_ops *ops;	/* how this group handles things, inotify_fops */
	
	struct list_head notification_list;	/* list of event_holder this group needs to send to userspace, fsnotify_event list */
	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
	unsigned int q_len;			/* events on the queue */
	unsigned int max_events;		/* maximum events allowed on the list */

	struct list_head marks_list;	/* all inode marks for this group, struct fsnotify_mark list */

	struct fasync_struct    *fsn_fa;    /* async notification */

	/* groups can define private fields here or use the void *private */
	union {
		void *private;
#ifdef CONFIG_INOTIFY_USER
		struct inotify_group_private_data {
			spinlock_t	idr_lock;
			struct idr      idr;   ///id -> inotify_inode_mark*
			struct user_struct      *user;
		} inotify_data; ///for inotify
#endif
	}
}
  • fsnotify_mark

fsnotify_mark is the bridge between an fsnotify_group and an inode: fsnotify_group->marks_list is the list of fsnotify_marks, and fsnotify_mark.i.inode points to the inode of the watched file; conversely, inode->i_fsnotify_marks holds the marks of all inotify instances watching that inode.

struct inotify_inode_mark {
	struct fsnotify_mark fsn_mark;
	int wd; ///watch descriptor
};


struct fsnotify_mark {
	__u32 mask;			/* mask this mark is for */
	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
	 * in kernel that found and may be using this mark. */
	atomic_t refcnt;		/* active things looking at this mark */
	struct fsnotify_group *group;	/* group this mark is for */
	struct list_head g_list;	/* list of marks by group->i_fsnotify_marks */
	spinlock_t lock;		/* protect group and inode */
	union {
		struct fsnotify_inode_mark i;
		struct fsnotify_vfsmount_mark m;
	};
	__u32 ignored_mask;		/* events types to ignore */
#define FSNOTIFY_MARK_FLAG_INODE		0x01
#define FSNOTIFY_MARK_FLAG_VFSMOUNT		0x02
#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED	0x04
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY	0x08
#define FSNOTIFY_MARK_FLAG_ALIVE		0x10
	unsigned int flags;		/* vfsmount or inode mark? */
	struct list_head destroy_list;
	void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
};

/*
 * Inode specific fields in an fsnotify_mark
 */
struct fsnotify_inode_mark {
	struct inode *inode;		/* inode this mark is associated with */
	struct hlist_node i_list;	/* list of marks by inode->i_fsnotify_marks */
	struct list_head free_i_list;	/* tmp list used when freeing this mark */
};
  • inode and file
/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {

#ifdef CONFIG_FSNOTIFY
	__u32			i_fsnotify_mask; /* all events this inode cares about */
	struct hlist_head	i_fsnotify_marks; ///struct fsnotify_inode_mark list, see fsnotify_inode_mark.i_list
#endif 
}


struct file {
    ///...
	
	void			*private_data; ///fsnotify_group*
}

Implementation of overlayfs

  • Data structures

The key data structures of overlayfs:

struct dentry {
	struct dentry *d_parent;	/* parent directory dentry object */
	struct qstr d_name;   ///name of this path component
	struct inode *d_inode;		/* inode对象, create by ovl_new_inode */
	
	const struct dentry_operations *d_op; /// == super_block->s_d_op == ovl_dentry_operations
	struct super_block *d_sb;	/* The root of the dentry tree */

	void *d_fsdata;			/* fs-specific data, struct ovl_entry */
}


/* private information held for every overlayfs dentry */
struct ovl_entry {
	struct dentry *__upperdentry; ///not NULL if got in upperdir
	struct ovl_dir_cache *cache;
	union {
		struct {
			u64 version;
			bool opaque;
		};
		struct rcu_head rcu;
	};
	unsigned numlower;
	struct path lowerstack[]; ///not NULL if got in lowdir
};


struct inode {
	const struct inode_operations	*i_op; ///ovl_dir_inode_operations
	struct super_block	*i_sb;
	
	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops, ovl_dir_operations */
	
	void			*i_private; /* fs or device private pointer,  struct ovl_entry*/
};

dentry is the kernel's directory entry object; every directory (or file) has one. For overlayfs, the inode a dentry points to has no actual on-disk data; it is an in-memory inode created by ovl_new_inode. dentry->d_fsdata points to an ovl_entry, which in turn points to the real dentry of the underlying filesystem.

When walking a path in overlayfs, dentry->d_inode is of little use; in fact, in ovl_lookup the struct inode *dir parameter that represents the parent directory is not used at all. What really drives the lookup is the ovl_entry pointed to by dentry->d_fsdata: through the ovl_entry the lookup descends into the underlying filesystems.

///dir: parent directory inode object, dentry: dentry object for current finding dircotry entry
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags) ///called by lookup_real
{
	struct ovl_entry *oe;
	struct ovl_entry *poe = dentry->d_parent->d_fsdata; ///dentry->d_parent->d_inode == dir
	struct path *stack = NULL;
	struct dentry *upperdir, *upperdentry = NULL;
	unsigned int ctr = 0;
	struct inode *inode = NULL;
	bool upperopaque = false;
	struct dentry *this, *prev = NULL;
	unsigned int i;
	int err;

	upperdir = ovl_upperdentry_dereference(poe);
	if (upperdir) { ///(1)lookup in upperdir firstly
		this = ovl_lookup_real(upperdir, &dentry->d_name);
		err = PTR_ERR(this);
		if (IS_ERR(this))
			goto out;

		if (this) {///exist in upperdir
			if (unlikely(ovl_dentry_remote(this))) {
				dput(this);
				err = -EREMOTE;
				goto out;
			}
			if (ovl_is_whiteout(this)) {
				dput(this); ///whiteout file
				this = NULL;
				upperopaque = true;
			} else if (poe->numlower && ovl_is_opaquedir(this)) {
				upperopaque = true; ///opaque dir
			}
		}
		upperdentry = prev = this;
	}
	///(2)didn't find dentry in upperdir
	if (!upperopaque && poe->numlower) {
		err = -ENOMEM;
		stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
		if (!stack)
			goto out_put_upper;
	}
	///(3)find dentry in lowdir
	for (i = 0; !upperopaque && i < poe->numlower; i++) {
		bool opaque = false;
		struct path lowerpath = poe->lowerstack[i];

		this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
		err = PTR_ERR(this);
		if (IS_ERR(this)) {
			/*
			 * If it's positive, then treat ENAMETOOLONG as ENOENT.
			 */
			if (err == -ENAMETOOLONG && (upperdentry || ctr))
				continue;
			goto out_put;
		}
		if (!this)
			continue;
		if (ovl_is_whiteout(this)) {
			dput(this);
			break;
		}
		/*
		 * Only makes sense to check opaque dir if this is not the
		 * lowermost layer.
		 */
		if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
			opaque = true;

		if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
			     !S_ISDIR(this->d_inode->i_mode))) {
			/*
			 * FIXME: check for upper-opaqueness maybe better done
			 * in remove code.
			 */
			if (prev == upperdentry)
				upperopaque = true;
			dput(this);
			break;
		}
		/*
		 * If this is a non-directory then stop here.
		 */
		if (!S_ISDIR(this->d_inode->i_mode))
			opaque = true;

		stack[ctr].dentry = this;
		stack[ctr].mnt = lowerpath.mnt;
		ctr++;
		prev = this;
		if (opaque)
			break;
	}

	oe = ovl_alloc_entry(ctr); ///ovl_dentry for current finding dentry
	err = -ENOMEM;
	if (!oe)
		goto out_put;

	if (upperdentry || ctr) {///if got in upperdir, upperdentry != NULL; else if got in lowdir, ctr > 0
		struct dentry *realdentry;

		realdentry = upperdentry ? upperdentry : stack[0].dentry;
		///alloc overlayfs inode for current real inode
		err = -ENOMEM;
		inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
				      oe);
		if (!inode)
			goto out_free_oe;
		ovl_copyattr(realdentry->d_inode, inode);
	}

	oe->opaque = upperopaque;
	oe->__upperdentry = upperdentry;
	memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
	kfree(stack);
	dentry->d_fsdata = oe; ///ovl_entry
	d_add(dentry, inode);

	return NULL;

out_free_oe:
	kfree(oe);
out_put:
	for (i = 0; i < ctr; i++)
		dput(stack[i].dentry);
	kfree(stack);
out_put_upper:
	dput(upperdentry);
out:
	return ERR_PTR(err);
}
  • open and copy up

When overlayfs opens a file, it makes struct file->f_inode point to the real inode; moreover, if the open is going to modify the file and the file does not yet exist in upperdir, it is first copied up from lowerdir:

int vfs_open(const struct path *path, struct file *file,
            const struct cred *cred)
{
	struct dentry *dentry = path->dentry; ///overlayfs dentry
	struct inode *inode = dentry->d_inode; ///overlayfs inode

	file->f_path = *path;
	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
		inode = dentry->d_op->d_select_inode(dentry, file->f_flags); ///get real inode, ovl_dentry_operations
		if (IS_ERR(inode))
			return PTR_ERR(inode);
	}

	return do_dentry_open(file, inode, NULL, cred); ///file->f_inode = inode
}

///return underlay fs inode
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;

	if (S_ISDIR(dentry->d_inode->i_mode))
		return dentry->d_inode;

	type = ovl_path_real(dentry, &realpath); ///real dentry
	if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { ///need copy up
		err = ovl_want_write(dentry);
		if (err)
			return ERR_PTR(err);

		if (file_flags & O_TRUNC)
			err = ovl_copy_up_truncate(dentry);
		else
			err = ovl_copy_up(dentry); ///copy up
		ovl_drop_write(dentry);
		if (err)
			return ERR_PTR(err);

		ovl_path_upper(dentry, &realpath);
	}

	if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
		return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);

	return realpath.dentry->d_inode; ///return real inode
}

Inotify and Overlayfs

inotify_add_watch uses the overlayfs inode:

SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
		u32, mask)
{

///...
	ret = inotify_find_inode(pathname, &path, flags); ///returns the overlayfs inode
	if (ret)
		goto fput_and_out;

	/* inode held in place by reference to path; group by fget on fd */
	inode = path.dentry->d_inode; ///monitored file(overlay inode)
	group = f.file->private_data; ///notify group

	/* create/update an inode mark */
	ret = inotify_update_watch(group, inode, mask);

///...
}

fsnotify_open, however, uses the underlay inode:

/*
 * fsnotify_open - file was opened
 */
static inline void fsnotify_open(struct file *file)
{
	struct path *path = &file->f_path;
	struct inode *inode = file_inode(file); ///for overlayfs , after vfs_open, f->f_inode == underlay inode
	__u32 mask = FS_OPEN;

	if (S_ISDIR(inode->i_mode))
		mask |= FS_ISDIR;

	fsnotify_parent(path, NULL, mask);
	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
}

In vfs_open, the kernel makes file->f_inode point to the underlay inode:

int vfs_open(const struct path *path, struct file *file,
            const struct cred *cred)
{
	struct dentry *dentry = path->dentry; ///overlayfs dentry
	struct inode *inode = dentry->d_inode; ///overlayfs inode

	file->f_path = *path;
	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
		inode = dentry->d_op->d_select_inode(dentry, file->f_flags); ///get underlayfs inode, ovl_dentry_operations
		if (IS_ERR(inode))
			return PTR_ERR(inode);
	}

	return do_dentry_open(file, inode, NULL, cred); ///file->f_inode = inode
}

So when a single file is watched through overlayfs, the watch is attached to the overlayfs inode while the events are generated on the underlay inode, and no events are ever delivered.

Reference

]]>