3497f17497SC.J. Collier
3597f17497SC.J. Collier#include <stdint.h>
3697f17497SC.J. Collier#include <rte_ethdev.h>
3797f17497SC.J. Collier#include <rte_malloc.h>
3897f17497SC.J. Collier
3997f17497SC.J. Collier#include "base/i40e_prototype.h"
4097f17497SC.J. Collier#include "base/i40e_type.h"
4197f17497SC.J. Collier#include "i40e_ethdev.h"
4297f17497SC.J. Collier#include "i40e_rxtx.h"
436b3e017eSChristian Ehrhardt#include "i40e_rxtx_vec_common.h"
4497f17497SC.J. Collier
456b3e017eSChristian Ehrhardt#include <arm_neon.h>
4697f17497SC.J. Collier
4797f17497SC.J. Collier#pragma GCC diagnostic ignored "-Wcast-qual"
4897f17497SC.J. Collier
4997f17497SC.J. Collierstatic inline void
5097f17497SC.J. Collieri40e_rxq_rearm(struct i40e_rx_queue *rxq)
5197f17497SC.J. Collier{
5297f17497SC.J. Collier	int i;
5397f17497SC.J. Collier	uint16_t rx_id;
5497f17497SC.J. Collier	volatile union i40e_rx_desc *rxdp;
5597f17497SC.J. Collier	struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
5697f17497SC.J. Collier	struct rte_mbuf *mb0, *mb1;
576b3e017eSChristian Ehrhardt	uint64x2_t dma_addr0, dma_addr1;
586b3e017eSChristian Ehrhardt	uint64x2_t zero = vdupq_n_u64(0);
596b3e017eSChristian Ehrhardt	uint64_t paddr;
606b3e017eSChristian Ehrhardt	uint8x8_t p;
6197f17497SC.J. Collier
6297f17497SC.J. Collier	rxdp = rxq->rx_ring + rxq->rxrearm_start;
6397f17497SC.J. Collier
6497f17497SC.J. Collier	/* Pull 'n' more MBUFs into the software ring */
656b3e017eSChristian Ehrhardt	if (unlikely(rte_mempool_get_bulk(rxq->mp,
666b3e017eSChristian Ehrhardt					  (void *)rxep,
676b3e017eSChristian Ehrhardt					  RTE_I40E_RXQ_REARM_THRESH) < 0)) {
6897f17497SC.J. Collier		if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
6997f17497SC.J. Collier		    rxq->nb_rx_desc) {
7097f17497SC.J. Collier			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
7197f17497SC.J. Collier				rxep[i].mbuf = &rxq->fake_mbuf;
726b3e017eSChristian Ehrhardt				vst1q_u64((uint64_t *)&rxdp[i].read, zero);
7397f17497SC.J. Collier			}
7497f17497SC.J. Collier		}
7597f17497SC.J. Collier		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
7697f17497SC.J. Collier			RTE_I40E_RXQ_REARM_THRESH;
7797f17497SC.J. Collier		return;
7897f17497SC.J. Collier	}
7997f17497SC.J. Collier
806b3e017eSChristian Ehrhardt	p = vld1_u8((uint8_t *)&rxq->mbuf_initializer);
816b3e017eSChristian Ehrhardt
8297f17497SC.J. Collier	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
8397f17497SC.J. Collier	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
8497f17497SC.J. Collier		mb0 = rxep[0].mbuf;
8597f17497SC.J. Collier		mb1 = rxep[1].mbuf;
8697f17497SC.J. Collier
8797f17497SC.J. Collier		 /* Flush mbuf with pkt template.
8897f17497SC.J. Collier		 * Data to be rearmed is 6 bytes long.
8997f17497SC.J. Collier		 * Though, RX will overwrite ol_flags that are coming next
9097f17497SC.J. Collier		 * anyway. So overwrite whole 8 bytes with one load:
9197f17497SC.J. Collier		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
9297f17497SC.J. Collier		 */
936b3e017eSChristian Ehrhardt		vst1_u8((uint8_t *)&mb0->rearm_data, p);
946b3e017eSChristian Ehrhardt		paddr = mb0->buf_physaddr + RTE_PKTMBUF_HEADROOM;
956b3e017eSChristian Ehrhardt		dma_addr0 = vdupq_n_u64(paddr);
9697f17497SC.J. Collier
9797f17497SC.J. Collier		/* flush desc with pa dma_addr */
986b3e017eSChristian Ehrhardt		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
996b3e017eSChristian Ehrhardt
1006b3e017eSChristian Ehrhardt		vst1_u8((uint8_t *)&mb1->rearm_data, p);
1016b3e017eSChristian Ehrhardt		paddr = mb1->buf_physaddr + RTE_PKTMBUF_HEADROOM;
1026b3e017eSChristian Ehrhardt		dma_addr1 = vdupq_n_u64(paddr);
1036b3e017eSChristian Ehrhardt		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
10497f17497SC.J. Collier	}
10597f17497SC.J. Collier
10697f17497SC.J. Collier	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
10797f17497SC.J. Collier	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
10897f17497SC.J. Collier		rxq->rxrearm_start = 0;
10997f17497SC.J. Collier
11097f17497SC.J. Collier	rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;
11197f17497SC.J. Collier
11297f17497SC.J. Collier	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
11397f17497SC.J. Collier			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
11497f17497SC.J. Collier
11597f17497SC.J. Collier	/* Update the tail pointer on the NIC */
11697f17497SC.J. Collier	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
11797f17497SC.J. Collier}
11897f17497SC.J. Collier
11997f17497SC.J. Collier/* Handling the offload flags (olflags) field takes computation
12097f17497SC.J. Collier * time when receiving packets. Therefore we provide a flag to disable
12197f17497SC.J. Collier * the processing of the olflags field when they are not needed. This
12297f17497SC.J. Collier * gives improved performance, at the cost of losing the offload info
12397f17497SC.J. Collier * in the received packet
12497f17497SC.J. Collier */
12597f17497SC.J. Collier#ifdef RTE_LIBRTE_I40E_RX_OLFLAGS_ENABLE
12697f17497SC.J. Collier
12797f17497SC.J. Collierstatic inline void
1286b3e017eSChristian Ehrhardtdesc_to_olflags_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
12997f17497SC.J. Collier{
1306b3e017eSChristian Ehrhardt	uint32x4_t vlan0, vlan1, rss, l3_l4e;
13197f17497SC.J. Collier
1328b25d1adSChristian Ehrhardt	/* mask everything except RSS, flow director and VLAN flags
1338b25d1adSChristian Ehrhardt	 * bit2 is for VLAN tag, bit11 for flow director indication
1348b25d1adSChristian Ehrhardt	 * bit13:12 for RSS indication.
1358b25d1adSChristian Ehrhardt	 */
1366b3e017eSChristian Ehrhardt	const uint32x4_t rss_vlan_msk = {
1376b3e017eSChristian Ehrhardt			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};
13897f17497SC.J. Collier
13997f17497SC.J. Collier	/* map rss and vlan type to rss hash and vlan flag */
1406b3e017eSChristian Ehrhardt	const uint8x16_t vlan_flags = {
1416b3e017eSChristian Ehrhardt			0, 0, 0, 0,
1426b3e017eSChristian Ehrhardt			PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
14397f17497SC.J. Collier			0, 0, 0, 0,
1446b3e017eSChristian Ehrhardt			0, 0, 0, 0};
14597f17497SC.J. Collier
1466b3e017eSChristian Ehrhardt	const uint8x16_t rss_flags = {
1476b3e017eSChristian Ehrhardt			0, PKT_RX_FDIR, 0, 0,
1486b3e017eSChristian Ehrhardt			0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
14997f17497SC.J. Collier			0, 0, 0, 0,
1506b3e017eSChristian Ehrhardt			0, 0, 0, 0};
1516b3e017eSChristian Ehrhardt
1526b3e017eSChristian Ehrhardt	const uint8x16_t l3_l4e_flags = {
1536b3e017eSChristian Ehrhardt			0,
1546b3e017eSChristian Ehrhardt			PKT_RX_IP_CKSUM_BAD,
1556b3e017eSChristian Ehrhardt			PKT_RX_L4_CKSUM_BAD,
1566b3e017eSChristian Ehrhardt			PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD,
1576b3e017eSChristian Ehrhardt			PKT_RX_EIP_CKSUM_BAD,
1586b3e017eSChristian Ehrhardt			PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD,
1596b3e017eSChristian Ehrhardt			PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
1606b3e017eSChristian Ehrhardt			PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD,
1616b3e017eSChristian Ehrhardt			0, 0, 0, 0, 0, 0, 0, 0};
1626b3e017eSChristian Ehrhardt
1636b3e017eSChristian Ehrhardt	vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
1646b3e017eSChristian Ehrhardt			  vreinterpretq_u32_u64(descs[2])).val[1];
1656b3e017eSChristian Ehrhardt	vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
1666b3e017eSChristian Ehrhardt			  vreinterpretq_u32_u64(descs[3])).val[1];
1676b3e017eSChristian Ehrhardt	vlan0 = vzipq_u32(vlan0, vlan1).val[0];
1686b3e017eSChristian Ehrhardt
1696b3e017eSChristian Ehrhardt	vlan1 = vandq_u32(vlan0, rss_vlan_msk);
1706b3e017eSChristian Ehrhardt	vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
1716b3e017eSChristian Ehrhardt						vreinterpretq_u8_u32(vlan1)));
1726b3e017eSChristian Ehrhardt
1736b3e017eSChristian Ehrhardt	rss = vshrq_n_u32(vlan1, 11);
1746b3e017eSChristian Ehrhardt	rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
1756b3e017eSChristian Ehrhardt					      vreinterpretq_u8_u32(rss)));
1766b3e017eSChristian Ehrhardt
1776b3e017eSChristian Ehrhardt	l3_l4e = vshrq_n_u32(vlan1, 22);
1786b3e017eSChristian Ehrhardt	l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
1796b3e017eSChristian Ehrhardt					      vreinterpretq_u8_u32(l3_l4e)));
1806b3e017eSChristian Ehrhardt
1816b3e017eSChristian Ehrhardt
1826b3e017eSChristian Ehrhardt	vlan0 = vorrq_u32(vlan0, rss);
1836b3e017eSChristian Ehrhardt	vlan0 = vorrq_u32(vlan0, l3_l4e);
1846b3e017eSChristian Ehrhardt
1856b3e017eSChristian Ehrhardt	rx_pkts[0]->ol_flags = vgetq_lane_u32(vlan0, 0);
1866b3e017eSChristian Ehrhardt	rx_pkts[1]->ol_flags = vgetq_lane_u32(vlan0, 1);
1876b3e017eSChristian Ehrhardt	rx_pkts[2]->ol_flags = vgetq_lane_u32(vlan0, 2);
1886b3e017eSChristian Ehrhardt	rx_pkts[3]->ol_flags = vgetq_lane_u32(vlan0, 3);
1896b3e017eSChristian Ehrhardt}
1906b3e017eSChristian Ehrhardt#else
1916b3e017eSChristian Ehrhardt#define desc_to_olflags_v(descs, rx_pkts) do {} while (0)
1926b3e017eSChristian Ehrhardt#endif
19397f17497SC.J. Collier
1946b3e017eSChristian Ehrhardt#define PKTLEN_SHIFT     10
19547d9763aSLuca Boccassi#define I40E_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))
19697f17497SC.J. Collier
1976b3e017eSChristian Ehrhardtstatic inline void
1986b3e017eSChristian Ehrhardtdesc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
1996b3e017eSChristian Ehrhardt{
2006b3e017eSChristian Ehrhardt	int i;
2016b3e017eSChristian Ehrhardt	uint8_t ptype;
2026b3e017eSChristian Ehrhardt	uint8x16_t tmp;
20397f17497SC.J. Collier
2046b3e017eSChristian Ehrhardt	for (i = 0; i < 4; i++) {
2056b3e017eSChristian Ehrhardt		tmp = vreinterpretq_u8_u64(vshrq_n_u64(descs[i], 30));
2066b3e017eSChristian Ehrhardt		ptype = vgetq_lane_u8(tmp, 8);
207aab0c291SChristian Ehrhardt		rx_pkts[i]->packet_type = i40e_rxd_pkt_type_mapping(ptype);
2086b3e017eSChristian Ehrhardt	}
20997f17497SC.J. Collier
21097f17497SC.J. Collier}
21197f17497SC.J. Collier
21297f17497SC.J. Collier /*
21397f17497SC.J. Collier * Notice:
21497f17497SC.J. Collier * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
21597f17497SC.J. Collier * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
21697f17497SC.J. Collier *   numbers of DD bits
21797f17497SC.J. Collier */
21897f17497SC.J. Collierstatic inline uint16_t
21997f17497SC.J. Collier_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
22097f17497SC.J. Collier		   uint16_t nb_pkts, uint8_t *split_packet)
22197f17497SC.J. Collier{
22297f17497SC.J. Collier	volatile union i40e_rx_desc *rxdp;
22397f17497SC.J. Collier	struct i40e_rx_entry *sw_ring;
22497f17497SC.J. Collier	uint16_t nb_pkts_recd;
22597f17497SC.J. Collier	int pos;
22697f17497SC.J. Collier
2276b3e017eSChristian Ehrhardt	/* mask to shuffle from desc. to mbuf */
2286b3e017eSChristian Ehrhardt	uint8x16_t shuf_msk = {
2296b3e017eSChristian Ehrhardt		0xFF, 0xFF,   /* pkt_type set as unknown */
2306b3e017eSChristian Ehrhardt		0xFF, 0xFF,   /* pkt_type set as unknown */
2316b3e017eSChristian Ehrhardt		14, 15,       /* octet 15~14, low 16 bits pkt_len */
2326b3e017eSChristian Ehrhardt		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
2336b3e017eSChristian Ehrhardt		14, 15,       /* octet 15~14, 16 bits data_len */
2346b3e017eSChristian Ehrhardt		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
2356b3e017eSChristian Ehrhardt		4, 5, 6, 7    /* octet 4~7, 32bits rss */
2366b3e017eSChristian Ehrhardt		};
2376b3e017eSChristian Ehrhardt
2386b3e017eSChristian Ehrhardt	uint8x16_t eop_check = {
2396b3e017eSChristian Ehrhardt		0x02, 0x00, 0x02, 0x00,
2406b3e017eSChristian Ehrhardt		0x02, 0x00, 0x02, 0x00,
2416b3e017eSChristian Ehrhardt		0x00, 0x00, 0x00, 0x00,
2426b3e017eSChristian Ehrhardt		0x00, 0x00, 0x00, 0x00
2436b3e017eSChristian Ehrhardt		};
2446b3e017eSChristian Ehrhardt
2456b3e017eSChristian Ehrhardt	uint16x8_t crc_adjust = {
2466b3e017eSChristian Ehrhardt		0, 0,         /* ignore pkt_type field */
2476b3e017eSChristian Ehrhardt		rxq->crc_len, /* sub crc on pkt_len */
2486b3e017eSChristian Ehrhardt		0,            /* ignore high-16bits of pkt_len */
2496b3e017eSChristian Ehrhardt		rxq->crc_len, /* sub crc on data_len */
2506b3e017eSChristian Ehrhardt		0, 0, 0       /* ignore non-length fields */
2516b3e017eSChristian Ehrhardt		};
25297f17497SC.J. Collier
25397f17497SC.J. Collier	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
25497f17497SC.J. Collier	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);
25597f17497SC.J. Collier
25697f17497SC.J. Collier	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
25797f17497SC.J. Collier	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);
25897f17497SC.J. Collier
25997f17497SC.J. Collier	/* Just the act of getting into the function from the application is
26097f17497SC.J. Collier	 * going to cost about 7 cycles
26197f17497SC.J. Collier	 */
26297f17497SC.J. Collier	rxdp = rxq->rx_ring + rxq->rx_tail;
26397f17497SC.J. Collier
2646b3e017eSChristian Ehrhardt	rte_prefetch_non_temporal(rxdp);
26597f17497SC.J. Collier
26697f17497SC.J. Collier	/* See if we need to rearm the RX queue - gives the prefetch a bit
26797f17497SC.J. Collier	 * of time to act
26897f17497SC.J. Collier	 */
26997f17497SC.J. Collier	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
27097f17497SC.J. Collier		i40e_rxq_rearm(rxq);
27197f17497SC.J. Collier
27297f17497SC.J. Collier	/* Before we start moving massive data around, check to see if
27397f17497SC.J. Collier	 * there is actually a packet available
27497f17497SC.J. Collier	 */
27597f17497SC.J. Collier	if (!(rxdp->wb.qword1.status_error_len &
27697f17497SC.J. Collier			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
27797f17497SC.J. Collier		return 0;
27897f17497SC.J. Collier
27997f17497SC.J. Collier	/* Cache is empty -> need to scan the buffer rings, but first move
28097f17497SC.J. Collier	 * the next 'n' mbufs into the cache
28197f17497SC.J. Collier	 */
28297f17497SC.J. Collier	sw_ring = &rxq->sw_ring[rxq->rx_tail];
28397f17497SC.J. Collier
28497f17497SC.J. Collier	/* A. load 4 packet in one loop
28597f17497SC.J. Collier	 * [A*. mask out 4 unused dirty field in desc]
28697f17497SC.J. Collier	 * B. copy 4 mbuf point from swring to rx_pkts
28797f17497SC.J. Collier	 * C. calc the number of DD bits among the 4 packets
28897f17497SC.J. Collier	 * [C*. extract the end-of-packet bit, if requested]
28997f17497SC.J. Collier	 * D. fill info. from desc to mbuf
29097f17497SC.J. Collier	 */
29197f17497SC.J. Collier
2927b53c036SRicardo Salveti	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
29397f17497SC.J. Collier			pos += RTE_I40E_DESCS_PER_LOOP,
29497f17497SC.J. Collier			rxdp += RTE_I40E_DESCS_PER_LOOP) {
2956b3e017eSChristian Ehrhardt		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
2966b3e017eSChristian Ehrhardt		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
2976b3e017eSChristian Ehrhardt		uint16x8x2_t sterr_tmp1, sterr_tmp2;
2986b3e017eSChristian Ehrhardt		uint64x2_t mbp1, mbp2;
2996b3e017eSChristian Ehrhardt		uint16x8_t staterr;
3006b3e017eSChristian Ehrhardt		uint16x8_t tmp;
3016b3e017eSChristian Ehrhardt		uint64_t stat;
3026b3e017eSChristian Ehrhardt
3036b3e017eSChristian Ehrhardt		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};
30497f17497SC.J. Collier
30597f17497SC.J. Collier		/* B.1 load 1 mbuf point */
3066b3e017eSChristian Ehrhardt		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
30797f17497SC.J. Collier		/* Read desc statuses backwards to avoid race condition */
30897f17497SC.J. Collier		/* A.1 load 4 pkts desc */
3096b3e017eSChristian Ehrhardt		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
3106b3e017eSChristian Ehrhardt		rte_rmb();
3116b3e017eSChristian Ehrhardt
31297f17497SC.J. Collier		/* B.2 copy 2 mbuf point into rx_pkts  */
3136b3e017eSChristian Ehrhardt		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
31497f17497SC.J. Collier
31597f17497SC.J. Collier		/* B.1 load 1 mbuf point */
3166b3e017eSChristian Ehrhardt		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
31797f17497SC.J. Collier
3186b3e017eSChristian Ehrhardt		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
31997f17497SC.J. Collier		/* B.1 load 2 mbuf point */
3206b3e017eSChristian Ehrhardt		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
3216b3e017eSChristian Ehrhardt		descs[0] =  vld1q_u64((uint64_t *)(rxdp));
32297f17497SC.J. Collier
32397f17497SC.J. Collier		/* B.2 copy 2 mbuf point into rx_pkts  */
3246b3e017eSChristian Ehrhardt		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
32597f17497SC.J. Collier
32697f17497SC.J. Collier		if (split_packet) {
3278b25d1adSChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos]);
3288b25d1adSChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
3298b25d1adSChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
3308b25d1adSChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
33197f17497SC.J. Collier		}
33297f17497SC.J. Collier
33397f17497SC.J. Collier		/* avoid compiler reorder optimization */
33497f17497SC.J. Collier		rte_compiler_barrier();
33597f17497SC.J. Collier
3368b25d1adSChristian Ehrhardt		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
3376b3e017eSChristian Ehrhardt		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
3386b3e017eSChristian Ehrhardt					    len_shl);
3396b3e017eSChristian Ehrhardt		descs[3] = vreinterpretq_u64_u32(len3);
3406b3e017eSChristian Ehrhardt		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
3416b3e017eSChristian Ehrhardt					    len_shl);
3426b3e017eSChristian Ehrhardt		descs[2] = vreinterpretq_u64_u32(len2);
3438b25d1adSChristian Ehrhardt
34497f17497SC.J. Collier		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
3456b3e017eSChristian Ehrhardt		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
3466b3e017eSChristian Ehrhardt		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
34797f17497SC.J. Collier
34897f17497SC.J. Collier		/* C.1 4=>2 filter staterr info only */
3496b3e017eSChristian Ehrhardt		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
3506b3e017eSChristian Ehrhardt				       vreinterpretq_u16_u64(descs[3]));
35197f17497SC.J. Collier		/* C.1 4=>2 filter staterr info only */
3526b3e017eSChristian Ehrhardt		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
3536b3e017eSChristian Ehrhardt				       vreinterpretq_u16_u64(descs[2]));
3546b3e017eSChristian Ehrhardt
3556b3e017eSChristian Ehrhardt		/* C.2 get 4 pkts staterr value  */
3566b3e017eSChristian Ehrhardt		staterr = vzipq_u16(sterr_tmp1.val[1],
3576b3e017eSChristian Ehrhardt				    sterr_tmp2.val[1]).val[0];
35897f17497SC.J. Collier
35997f17497SC.J. Collier		desc_to_olflags_v(descs, &rx_pkts[pos]);
36097f17497SC.J. Collier
36197f17497SC.J. Collier		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
3626b3e017eSChristian Ehrhardt		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
3636b3e017eSChristian Ehrhardt		pkt_mb4 = vreinterpretq_u8_u16(tmp);
3646b3e017eSChristian Ehrhardt		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
3656b3e017eSChristian Ehrhardt		pkt_mb3 = vreinterpretq_u8_u16(tmp);
36697f17497SC.J. Collier
3678b25d1adSChristian Ehrhardt		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
3686b3e017eSChristian Ehrhardt		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
3696b3e017eSChristian Ehrhardt					    len_shl);
3706b3e017eSChristian Ehrhardt		descs[1] = vreinterpretq_u64_u32(len1);
3716b3e017eSChristian Ehrhardt		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
3726b3e017eSChristian Ehrhardt					    len_shl);
3736b3e017eSChristian Ehrhardt		descs[0] = vreinterpretq_u64_u32(len0);
3748b25d1adSChristian Ehrhardt
37597f17497SC.J. Collier		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
3766b3e017eSChristian Ehrhardt		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
3776b3e017eSChristian Ehrhardt		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
37897f17497SC.J. Collier
37997f17497SC.J. Collier		/* D.3 copy final 3,4 data to rx_pkts */
3806b3e017eSChristian Ehrhardt		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
38197f17497SC.J. Collier				 pkt_mb4);
3826b3e017eSChristian Ehrhardt		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
38397f17497SC.J. Collier				 pkt_mb3);
38497f17497SC.J. Collier
38597f17497SC.J. Collier		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
3866b3e017eSChristian Ehrhardt		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
3876b3e017eSChristian Ehrhardt		pkt_mb2 = vreinterpretq_u8_u16(tmp);
3886b3e017eSChristian Ehrhardt		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
3896b3e017eSChristian Ehrhardt		pkt_mb1 = vreinterpretq_u8_u16(tmp);
39097f17497SC.J. Collier
39197f17497SC.J. Collier		/* C* extract and record EOP bit */
39297f17497SC.J. Collier		if (split_packet) {
3936b3e017eSChristian Ehrhardt			uint8x16_t eop_shuf_mask = {
3946b3e017eSChristian Ehrhardt					0x00, 0x02, 0x04, 0x06,
39597f17497SC.J. Collier					0xFF, 0xFF, 0xFF, 0xFF,
39697f17497SC.J. Collier					0xFF, 0xFF, 0xFF, 0xFF,
3976b3e017eSChristian Ehrhardt					0xFF, 0xFF, 0xFF, 0xFF};
3986b3e017eSChristian Ehrhardt			uint8x16_t eop_bits;
39997f17497SC.J. Collier
40097f17497SC.J. Collier			/* and with mask to extract bits, flipping 1-0 */
4016b3e017eSChristian Ehrhardt			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
4026b3e017eSChristian Ehrhardt			eop_bits = vandq_u8(eop_bits, eop_check);
40397f17497SC.J. Collier			/* the staterr values are not in order, as the count
40497f17497SC.J. Collier			 * count of dd bits doesn't care. However, for end of
40597f17497SC.J. Collier			 * packet tracking, we do care, so shuffle. This also
40697f17497SC.J. Collier			 * compresses the 32-bit values to 8-bit
40797f17497SC.J. Collier			 */
4086b3e017eSChristian Ehrhardt			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);
4096b3e017eSChristian Ehrhardt
41097f17497SC.J. Collier			/* store the resulting 32-bit value */
4116b3e017eSChristian Ehrhardt			vst1q_lane_u32((uint32_t *)split_packet,
4126b3e017eSChristian Ehrhardt				       vreinterpretq_u32_u8(eop_bits), 0);
41397f17497SC.J. Collier			split_packet += RTE_I40E_DESCS_PER_LOOP;
41497f17497SC.J. Collier
41597f17497SC.J. Collier			/* zero-out next pointers */
41697f17497SC.J. Collier			rx_pkts[pos]->next = NULL;
41797f17497SC.J. Collier			rx_pkts[pos + 1]->next = NULL;
41897f17497SC.J. Collier			rx_pkts[pos + 2]->next = NULL;
41997f17497SC.J. Collier			rx_pkts[pos + 3]->next = NULL;
42097f17497SC.J. Collier		}
42197f17497SC.J. Collier
42247d9763aSLuca Boccassi		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
42347d9763aSLuca Boccassi		staterr = vreinterpretq_u16_s16(
42447d9763aSLuca Boccassi				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
42547d9763aSLuca Boccassi					    I40E_UINT16_BIT - 1));
42647d9763aSLuca Boccassi		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
42747d9763aSLuca Boccassi
4286b3e017eSChristian Ehrhardt		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);
42997f17497SC.J. Collier
43097f17497SC.J. Collier		/* D.3 copy final 1,2 data to rx_pkts */
4316b3e017eSChristian Ehrhardt		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
4326b3e017eSChristian Ehrhardt			 pkt_mb2);
4336b3e017eSChristian Ehrhardt		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
4346b3e017eSChristian Ehrhardt			 pkt_mb1);
4356b3e017eSChristian Ehrhardt		desc_to_ptype_v(descs, &rx_pkts[pos]);
43697f17497SC.J. Collier		/* C.4 calc avaialbe number of desc */
43747d9763aSLuca Boccassi		if (unlikely(stat == 0)) {
43847d9763aSLuca Boccassi			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
43947d9763aSLuca Boccassi		} else {
44047d9763aSLuca Boccassi			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
44197f17497SC.J. Collier			break;
44247d9763aSLuca Boccassi		}
44397f17497SC.J. Collier	}
44497f17497SC.J. Collier
44597f17497SC.J. Collier	/* Update our internal tail pointer */
44697f17497SC.J. Collier	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
44797f17497SC.J. Collier	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
44897f17497SC.J. Collier	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
44997f17497SC.J. Collier
45097f17497SC.J. Collier	return nb_pkts_recd;
45197f17497SC.J. Collier}
45297f17497SC.J. Collier
45397f17497SC.J. Collier /*
45497f17497SC.J. Collier * Notice:
45597f17497SC.J. Collier * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
45697f17497SC.J. Collier * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
45797f17497SC.J. Collier *   numbers of DD bits
45897f17497SC.J. Collier */
45997f17497SC.J. Collieruint16_t
46097f17497SC.J. Collieri40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
46197f17497SC.J. Collier		   uint16_t nb_pkts)
46297f17497SC.J. Collier{
46397f17497SC.J. Collier	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
46497f17497SC.J. Collier}
46597f17497SC.J. Collier
46697f17497SC.J. Collier /* vPMD receive routine that reassembles scattered packets
46797f17497SC.J. Collier * Notice:
46897f17497SC.J. Collier * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
46997f17497SC.J. Collier * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
47097f17497SC.J. Collier *   numbers of DD bits
47197f17497SC.J. Collier */
47297f17497SC.J. Collieruint16_t
47397f17497SC.J. Collieri40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
47497f17497SC.J. Collier			     uint16_t nb_pkts)
47597f17497SC.J. Collier{
47697f17497SC.J. Collier
47797f17497SC.J. Collier	struct i40e_rx_queue *rxq = rx_queue;
47897f17497SC.J. Collier	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};
47997f17497SC.J. Collier
48097f17497SC.J. Collier	/* get some new buffers */
48197f17497SC.J. Collier	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
48297f17497SC.J. Collier			split_flags);
48397f17497SC.J. Collier	if (nb_bufs == 0)
48497f17497SC.J. Collier		return 0;
48597f17497SC.J. Collier
48697f17497SC.J. Collier	/* happy day case, full burst + no packets to be joined */
48797f17497SC.J. Collier	const uint64_t *split_fl64 = (uint64_t *)split_flags;
48897f17497SC.J. Collier
48997f17497SC.J. Collier	if (rxq->pkt_first_seg == NULL &&
49097f17497SC.J. Collier			split_fl64[0] == 0 && split_fl64[1] == 0 &&
49197f17497SC.J. Collier			split_fl64[2] == 0 && split_fl64[3] == 0)
49297f17497SC.J. Collier		return nb_bufs;
49397f17497SC.J. Collier
49497f17497SC.J. Collier	/* reassemble any packets that need reassembly*/
49597f17497SC.J. Collier	unsigned i = 0;
49697f17497SC.J. Collier
49797f17497SC.J. Collier	if (rxq->pkt_first_seg == NULL) {
49897f17497SC.J. Collier		/* find the first split flag, and only reassemble then*/
49997f17497SC.J. Collier		while (i < nb_bufs && !split_flags[i])
50097f17497SC.J. Collier			i++;
50197f17497SC.J. Collier		if (i == nb_bufs)
50297f17497SC.J. Collier			return nb_bufs;
50397f17497SC.J. Collier	}
50497f17497SC.J. Collier	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
50597f17497SC.J. Collier		&split_flags[i]);
50697f17497SC.J. Collier}
50797f17497SC.J. Collier
50897f17497SC.J. Collierstatic inline void
50997f17497SC.J. Colliervtx1(volatile struct i40e_tx_desc *txdp,
51097f17497SC.J. Collier		struct rte_mbuf *pkt, uint64_t flags)
51197f17497SC.J. Collier{
51297f17497SC.J. Collier	uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
51397f17497SC.J. Collier			((uint64_t)flags  << I40E_TXD_QW1_CMD_SHIFT) |
51497f17497SC.J. Collier			((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));
51597f17497SC.J. Collier
5166b3e017eSChristian Ehrhardt	uint64x2_t descriptor = {pkt->buf_physaddr + pkt->data_off, high_qw};
5176b3e017eSChristian Ehrhardt	vst1q_u64((uint64_t *)txdp, descriptor);
51897f17497SC.J. Collier}
51997f17497SC.J. Collier
52097f17497SC.J. Collierstatic inline void
52197f17497SC.J. Colliervtx(volatile struct i40e_tx_desc *txdp,
52297f17497SC.J. Collier		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
52397f17497SC.J. Collier{
52497f17497SC.J. Collier	int i;
52597f17497SC.J. Collier
52697f17497SC.J. Collier	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
52797f17497SC.J. Collier		vtx1(txdp, *pkt, flags);
52897f17497SC.J. Collier}
52997f17497SC.J. Collier
53097f17497SC.J. Collieruint16_t
53197f17497SC.J. Collieri40e_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
53297f17497SC.J. Collier		   uint16_t nb_pkts)
53397f17497SC.J. Collier{
53497f17497SC.J. Collier	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
53597f17497SC.J. Collier	volatile struct i40e_tx_desc *txdp;
53697f17497SC.J. Collier	struct i40e_tx_entry *txep;
53797f17497SC.J. Collier	uint16_t n, nb_commit, tx_id;
53897f17497SC.J. Collier	uint64_t flags = I40E_TD_CMD;
53997f17497SC.J. Collier	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
54097f17497SC.J. Collier	int i;
54197f17497SC.J. Collier
54297f17497SC.J. Collier	/* cross rx_thresh boundary is not allowed */
54397f17497SC.J. Collier	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
54497f17497SC.J. Collier
54597f17497SC.J. Collier	if (txq->nb_tx_free < txq->tx_free_thresh)
54697f17497SC.J. Collier		i40e_tx_free_bufs(txq);
54797f17497SC.J. Collier
54897f17497SC.J. Collier	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
54997f17497SC.J. Collier	if (unlikely(nb_pkts == 0))
55097f17497SC.J. Collier		return 0;
55197f17497SC.J. Collier
55297f17497SC.J. Collier	tx_id = txq->tx_tail;
55397f17497SC.J. Collier	txdp = &txq->tx_ring[tx_id];
55497f17497SC.J. Collier	txep = &txq->sw_ring[tx_id];
55597f17497SC.J. Collier
55697f17497SC.J. Collier	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
55797f17497SC.J. Collier
55897f17497SC.J. Collier	n = (uint16_t)(txq->nb_tx_desc - tx_id);
55997f17497SC.J. Collier	if (nb_commit >= n) {
56097f17497SC.J. Collier		tx_backlog_entry(txep, tx_pkts, n);
56197f17497SC.J. Collier
56297f17497SC.J. Collier		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
56397f17497SC.J. Collier			vtx1(txdp, *tx_pkts, flags);
56497f17497SC.J. Collier
56597f17497SC.J. Collier		vtx1(txdp, *tx_pkts++, rs);
56697f17497SC.J. Collier
56797f17497SC.J. Collier		nb_commit = (uint16_t)(nb_commit - n);
56897f17497SC.J. Collier
56997f17497SC.J. Collier		tx_id = 0;
57097f17497SC.J. Collier		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
57197f17497SC.J. Collier
57297f17497SC.J. Collier		/* avoid reach the end of ring */
57397f17497SC.J. Collier		txdp = &txq->tx_ring[tx_id];
57497f17497SC.J. Collier		txep = &txq->sw_ring[tx_id];
57597f17497SC.J. Collier	}
57697f17497SC.J. Collier
57797f17497SC.J. Collier	tx_backlog_entry(txep, tx_pkts, nb_commit);
57897f17497SC.J. Collier
57997f17497SC.J. Collier	vtx(txdp, tx_pkts, nb_commit, flags);
58097f17497SC.J. Collier
58197f17497SC.J. Collier	tx_id = (uint16_t)(tx_id + nb_commit);
58297f17497SC.J. Collier	if (tx_id > txq->tx_next_rs) {
58397f17497SC.J. Collier		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
58497f17497SC.J. Collier			rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
58597f17497SC.J. Collier						I40E_TXD_QW1_CMD_SHIFT);
58697f17497SC.J. Collier		txq->tx_next_rs =
58797f17497SC.J. Collier			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
58897f17497SC.J. Collier	}
58997f17497SC.J. Collier
59097f17497SC.J. Collier	txq->tx_tail = tx_id;
59197f17497SC.J. Collier
59297f17497SC.J. Collier	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
59397f17497SC.J. Collier
59497f17497SC.J. Collier	return nb_pkts;
59597f17497SC.J. Collier}
59697f17497SC.J. Collier
/*
 * Release the mbufs of a vector RX queue; thin wrapper around
 * _i40e_rx_queue_release_mbufs_vec().
 */
void __attribute__((cold))
i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue *rxq)
{
	_i40e_rx_queue_release_mbufs_vec(rxq);
}
60297f17497SC.J. Collier
/*
 * Set up an RX queue for the vector path; returns whatever
 * i40e_rxq_vec_setup_default() returns.
 */
int __attribute__((cold))
i40e_rxq_vec_setup(struct i40e_rx_queue *rxq)
{
	const int ret = i40e_rxq_vec_setup_default(rxq);

	return ret;
}
60897f17497SC.J. Collier
60997f17497SC.J. Collierint __attribute__((cold))
61097f17497SC.J. Collieri40e_txq_vec_setup(struct i40e_tx_queue __rte_unused *txq)
61197f17497SC.J. Collier{
61297f17497SC.J. Collier	return 0;
61397f17497SC.J. Collier}
61497f17497SC.J. Collier
/*
 * Check the device configuration for the vector RX path; returns whatever
 * i40e_rx_vec_dev_conf_condition_check_default() returns.
 */
int __attribute__((cold))
i40e_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
{
	const int ret = i40e_rx_vec_dev_conf_condition_check_default(dev);

	return ret;
}