
#include <stdint.h>
#include <rte_ethdev.h>
#include <rte_malloc.h>

#include "ixgbe_ethdev.h"
#include "ixgbe_rxtx.h"
#include "ixgbe_rxtx_vec_common.h"

#include <tmmintrin.h>

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

485129044dSC.J. Collierstatic inline void
495129044dSC.J. Collierixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
505129044dSC.J. Collier{
515129044dSC.J. Collier	int i;
525129044dSC.J. Collier	uint16_t rx_id;
535129044dSC.J. Collier	volatile union ixgbe_adv_rx_desc *rxdp;
545129044dSC.J. Collier	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
555129044dSC.J. Collier	struct rte_mbuf *mb0, *mb1;
565129044dSC.J. Collier	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
575129044dSC.J. Collier			RTE_PKTMBUF_HEADROOM);
585129044dSC.J. Collier	__m128i dma_addr0, dma_addr1;
595129044dSC.J. Collier
605129044dSC.J. Collier	const __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX);
615129044dSC.J. Collier
625129044dSC.J. Collier	rxdp = rxq->rx_ring + rxq->rxrearm_start;
635129044dSC.J. Collier
645129044dSC.J. Collier	/* Pull 'n' more MBUFs into the software ring */
655129044dSC.J. Collier	if (rte_mempool_get_bulk(rxq->mb_pool,
665129044dSC.J. Collier				 (void *)rxep,
675129044dSC.J. Collier				 RTE_IXGBE_RXQ_REARM_THRESH) < 0) {
685129044dSC.J. Collier		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
695129044dSC.J. Collier		    rxq->nb_rx_desc) {
705129044dSC.J. Collier			dma_addr0 = _mm_setzero_si128();
715129044dSC.J. Collier			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
725129044dSC.J. Collier				rxep[i].mbuf = &rxq->fake_mbuf;
735129044dSC.J. Collier				_mm_store_si128((__m128i *)&rxdp[i].read,
745129044dSC.J. Collier						dma_addr0);
755129044dSC.J. Collier			}
765129044dSC.J. Collier		}
775129044dSC.J. Collier		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
785129044dSC.J. Collier			RTE_IXGBE_RXQ_REARM_THRESH;
795129044dSC.J. Collier		return;
805129044dSC.J. Collier	}
815129044dSC.J. Collier
825129044dSC.J. Collier	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
835129044dSC.J. Collier	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
845129044dSC.J. Collier		__m128i vaddr0, vaddr1;
855129044dSC.J. Collier		uintptr_t p0, p1;
865129044dSC.J. Collier
875129044dSC.J. Collier		mb0 = rxep[0].mbuf;
885129044dSC.J. Collier		mb1 = rxep[1].mbuf;
895129044dSC.J. Collier
905129044dSC.J. Collier		/*
915129044dSC.J. Collier		 * Flush mbuf with pkt template.
925129044dSC.J. Collier		 * Data to be rearmed is 6 bytes long.
935129044dSC.J. Collier		 * Though, RX will overwrite ol_flags that are coming next
945129044dSC.J. Collier		 * anyway. So overwrite whole 8 bytes with one load:
955129044dSC.J. Collier		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
965129044dSC.J. Collier		 */
975129044dSC.J. Collier		p0 = (uintptr_t)&mb0->rearm_data;
985129044dSC.J. Collier		*(uint64_t *)p0 = rxq->mbuf_initializer;
995129044dSC.J. Collier		p1 = (uintptr_t)&mb1->rearm_data;
1005129044dSC.J. Collier		*(uint64_t *)p1 = rxq->mbuf_initializer;
1015129044dSC.J. Collier
1025129044dSC.J. Collier		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
1035129044dSC.J. Collier		vaddr0 = _mm_loadu_si128((__m128i *)&(mb0->buf_addr));
1045129044dSC.J. Collier		vaddr1 = _mm_loadu_si128((__m128i *)&(mb1->buf_addr));
1055129044dSC.J. Collier
1065129044dSC.J. Collier		/* convert pa to dma_addr hdr/data */
1075129044dSC.J. Collier		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
1085129044dSC.J. Collier		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
1095129044dSC.J. Collier
1105129044dSC.J. Collier		/* add headroom to pa values */
1115129044dSC.J. Collier		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
1125129044dSC.J. Collier		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
1135129044dSC.J. Collier
1145129044dSC.J. Collier		/* set Header Buffer Address to zero */
1155129044dSC.J. Collier		dma_addr0 =  _mm_and_si128(dma_addr0, hba_msk);
1165129044dSC.J. Collier		dma_addr1 =  _mm_and_si128(dma_addr1, hba_msk);
1175129044dSC.J. Collier
1185129044dSC.J. Collier		/* flush desc with pa dma_addr */
1195129044dSC.J. Collier		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
1205129044dSC.J. Collier		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
1215129044dSC.J. Collier	}
1225129044dSC.J. Collier
1235129044dSC.J. Collier	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
1245129044dSC.J. Collier	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
1255129044dSC.J. Collier		rxq->rxrearm_start = 0;
1265129044dSC.J. Collier
1275129044dSC.J. Collier	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;
1285129044dSC.J. Collier
1295129044dSC.J. Collier	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
1305129044dSC.J. Collier			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
1315129044dSC.J. Collier
1325129044dSC.J. Collier	/* Update the tail pointer on the NIC */
1335129044dSC.J. Collier	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1345129044dSC.J. Collier}
1355129044dSC.J. Collier
/* Handling the offload flags (olflags) field takes computation
 * time when receiving packets. Therefore we provide a flag to disable
 * the processing of the olflags field when they are not needed. This
 * gives improved performance, at the cost of losing the offload info
 * in the received packet.
 */
#ifdef RTE_IXGBE_RX_OLFLAGS_ENABLE

/*
 * Derive ol_flags for four received packets from their four descriptors.
 *
 * descs:      the four descriptors, as loaded by the caller
 * vlan_flags: flag bits OR-ed into the result when the descriptor has the
 *             VLAN-present status bit set
 * rx_pkts:    the four mbufs whose ol_flags field is overwritten
 *
 * Only the low 64 bits of the final vector are used: one 16-bit lane per
 * packet, each assigned to the corresponding mbuf's ol_flags.
 */
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
	struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1, csum;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* mask the lower byte of ol_flags */
	const __m128i ol_flags_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x00FF, 0x00FF, 0x00FF, 0x00FF);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	/* mask everything except vlan present and l4/ip csum error */
	const __m128i vlan_csum_msk = _mm_set_epi16(
		(IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
		(IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
		(IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
		(IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
		IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
		IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);
	/* map vlan present (0x8), IPE (0x2), L4E (0x1) to ol_flags */
	const __m128i vlan_csum_map_lo = _mm_set_epi8(
		0, 0, 0, 0,
		vlan_flags | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
		vlan_flags | PKT_RX_IP_CKSUM_BAD,
		vlan_flags | PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
		vlan_flags | PKT_RX_IP_CKSUM_GOOD,
		0, 0, 0, 0,
		PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
		PKT_RX_IP_CKSUM_BAD,
		PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
		PKT_RX_IP_CKSUM_GOOD);

	/* Second lookup table for PKT_RX_L4_CKSUM_GOOD. The flag is stored
	 * shifted right by sizeof(uint8_t) (i.e. one bit) and shifted left
	 * by the same amount after the lookup, presumably so the table
	 * entry fits in a byte.
	 */
	const __m128i vlan_csum_map_hi = _mm_set_epi8(
		0, 0, 0, 0,
		0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
		PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t),
		0, 0, 0, 0,
		0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
		PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t));

	/* interleave the 16-bit words of the descriptors pairwise */
	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	/* low 64 bits now hold the first 16-bit word of each descriptor;
	 * use its rss type nibble as an index into rss_flags
	 */
	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
	vtag1 = _mm_and_si128(vtag1, vlan_csum_msk);

	/* csum bits are in the most significant, to use shuffle we need to
	 * shift them. Change mask from 0xc000 to 0x0003.
	 */
	csum = _mm_srli_epi16(vtag1, 14);

	/* now or the most significant 64 bits containing the checksum
	 * flags with the vlan present flags.
	 */
	csum = _mm_srli_si128(csum, 8);
	vtag1 = _mm_or_si128(csum, vtag1);

	/* convert VP, IPE, L4E to ol_flags */
	vtag0 = _mm_shuffle_epi8(vlan_csum_map_hi, vtag1);
	vtag0 = _mm_slli_epi16(vtag0, sizeof(uint8_t));

	vtag1 = _mm_shuffle_epi8(vlan_csum_map_lo, vtag1);
	vtag1 = _mm_and_si128(vtag1, ol_flags_msk);
	vtag1 = _mm_or_si128(vtag0, vtag1);

	/* merge rss flags with vlan/checksum flags */
	vtag1 = _mm_or_si128(ptype0, vtag1);
	vol.dword = _mm_cvtsi128_si64(vtag1);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
#else
/* olflags processing disabled: only mark vlan_flags as used */
#define desc_to_olflags_v(desc, vlan_flags, rx_pkts) do { \
		RTE_SET_USED(vlan_flags); \
	} while (0)
#endif

2445129044dSC.J. Collier/*
2455129044dSC.J. Collier * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
2465129044dSC.J. Collier *
2475129044dSC.J. Collier * Notice:
2485129044dSC.J. Collier * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
2495129044dSC.J. Collier * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
2505129044dSC.J. Collier *   numbers of DD bit
2515129044dSC.J. Collier * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
2525129044dSC.J. Collier */
2535129044dSC.J. Collierstatic inline uint16_t
2545129044dSC.J. Collier_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
2555129044dSC.J. Collier		uint16_t nb_pkts, uint8_t *split_packet)
2565129044dSC.J. Collier{
2575129044dSC.J. Collier	volatile union ixgbe_adv_rx_desc *rxdp;
2585129044dSC.J. Collier	struct ixgbe_rx_entry *sw_ring;
2595129044dSC.J. Collier	uint16_t nb_pkts_recd;
2605129044dSC.J. Collier	int pos;
2615129044dSC.J. Collier	uint64_t var;
2625129044dSC.J. Collier	__m128i shuf_msk;
2635129044dSC.J. Collier	__m128i crc_adjust = _mm_set_epi16(
2645129044dSC.J. Collier				0, 0, 0,    /* ignore non-length fields */
2655129044dSC.J. Collier				-rxq->crc_len, /* sub crc on data_len */
2665129044dSC.J. Collier				0,          /* ignore high-16bits of pkt_len */
2675129044dSC.J. Collier				-rxq->crc_len, /* sub crc on pkt_len */
2685129044dSC.J. Collier				0, 0            /* ignore pkt_type field */
2695129044dSC.J. Collier			);
2705129044dSC.J. Collier	__m128i dd_check, eop_check;
271809f0800SChristian Ehrhardt	uint8_t vlan_flags;
2725129044dSC.J. Collier
2735129044dSC.J. Collier	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
2745129044dSC.J. Collier	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);
2755129044dSC.J. Collier
2765129044dSC.J. Collier	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
2775129044dSC.J. Collier	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);
2785129044dSC.J. Collier
2795129044dSC.J. Collier	/* Just the act of getting into the function from the application is
280809f0800SChristian Ehrhardt	 * going to cost about 7 cycles
281809f0800SChristian Ehrhardt	 */
2825129044dSC.J. Collier	rxdp = rxq->rx_ring + rxq->rx_tail;
2835129044dSC.J. Collier
2843d9b7210SChristian Ehrhardt	rte_prefetch0(rxdp);
2855129044dSC.J. Collier
2865129044dSC.J. Collier	/* See if we need to rearm the RX queue - gives the prefetch a bit
287809f0800SChristian Ehrhardt	 * of time to act
288809f0800SChristian Ehrhardt	 */
2895129044dSC.J. Collier	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
2905129044dSC.J. Collier		ixgbe_rxq_rearm(rxq);
2915129044dSC.J. Collier
2925129044dSC.J. Collier	/* Before we start moving massive data around, check to see if
293809f0800SChristian Ehrhardt	 * there is actually a packet available
294809f0800SChristian Ehrhardt	 */
2955129044dSC.J. Collier	if (!(rxdp->wb.upper.status_error &
2965129044dSC.J. Collier				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
2975129044dSC.J. Collier		return 0;
2985129044dSC.J. Collier
2995129044dSC.J. Collier	/* 4 packets DD mask */
3005129044dSC.J. Collier	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);
3015129044dSC.J. Collier
3025129044dSC.J. Collier	/* 4 packets EOP mask */
3035129044dSC.J. Collier	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);
3045129044dSC.J. Collier
3055129044dSC.J. Collier	/* mask to shuffle from desc. to mbuf */
3065129044dSC.J. Collier	shuf_msk = _mm_set_epi8(
3075129044dSC.J. Collier		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
3085129044dSC.J. Collier		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
3095129044dSC.J. Collier		13, 12,      /* octet 12~13, 16 bits data_len */
3105129044dSC.J. Collier		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
3115129044dSC.J. Collier		13, 12,      /* octet 12~13, low 16 bits pkt_len */
3125129044dSC.J. Collier		0xFF, 0xFF,  /* skip 32 bit pkt_type */
3135129044dSC.J. Collier		0xFF, 0xFF
3145129044dSC.J. Collier		);
3155129044dSC.J. Collier
3165129044dSC.J. Collier	/* Cache is empty -> need to scan the buffer rings, but first move
317809f0800SChristian Ehrhardt	 * the next 'n' mbufs into the cache
318809f0800SChristian Ehrhardt	 */
3195129044dSC.J. Collier	sw_ring = &rxq->sw_ring[rxq->rx_tail];
3205129044dSC.J. Collier
321809f0800SChristian Ehrhardt	/* ensure these 2 flags are in the lower 8 bits */
323809f0800SChristian Ehrhardt	vlan_flags = rxq->vlan_flags & UINT8_MAX;
324809f0800SChristian Ehrhardt
3255129044dSC.J. Collier	/* A. load 4 packet in one loop
3265129044dSC.J. Collier	 * [A*. mask out 4 unused dirty field in desc]
3275129044dSC.J. Collier	 * B. copy 4 mbuf point from swring to rx_pkts
3285129044dSC.J. Collier	 * C. calc the number of DD bits among the 4 packets
3295129044dSC.J. Collier	 * [C*. extract the end-of-packet bit, if requested]
3305129044dSC.J. Collier	 * D. fill info. from desc to mbuf
3315129044dSC.J. Collier	 */
3325129044dSC.J. Collier	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
3335129044dSC.J. Collier			pos += RTE_IXGBE_DESCS_PER_LOOP,
3345129044dSC.J. Collier			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
3355129044dSC.J. Collier		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
3365129044dSC.J. Collier		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
3375129044dSC.J. Collier		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
3389365d6cfSChristian Ehrhardt		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
3399365d6cfSChristian Ehrhardt		__m128i mbp1;
3409365d6cfSChristian Ehrhardt#if defined(RTE_ARCH_X86_64)
3419365d6cfSChristian Ehrhardt		__m128i mbp2;
3429365d6cfSChristian Ehrhardt#endif
3435129044dSC.J. Collier
3449365d6cfSChristian Ehrhardt		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
3455129044dSC.J. Collier		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
3465129044dSC.J. Collier
3475129044dSC.J. Collier		/* Read desc statuses backwards to avoid race condition */
3485129044dSC.J. Collier		/* A.1 load 4 pkts desc */
3495129044dSC.J. Collier		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
350c300e355SChristian Ehrhardt		rte_compiler_barrier();
3515129044dSC.J. Collier
3529365d6cfSChristian Ehrhardt		/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
3535129044dSC.J. Collier		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);
3545129044dSC.J. Collier
3559365d6cfSChristian Ehrhardt#if defined(RTE_ARCH_X86_64)
3569365d6cfSChristian Ehrhardt		/* B.1 load 2 64 bit mbuf points */
3575129044dSC.J. Collier		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);
3589365d6cfSChristian Ehrhardt#endif
3595129044dSC.J. Collier
3605129044dSC.J. Collier		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
361c300e355SChristian Ehrhardt		rte_compiler_barrier();
3625129044dSC.J. Collier		/* B.1 load 2 mbuf point */
3635129044dSC.J. Collier		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
364c300e355SChristian Ehrhardt		rte_compiler_barrier();
3655129044dSC.J. Collier		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));
3665129044dSC.J. Collier
3679365d6cfSChristian Ehrhardt#if defined(RTE_ARCH_X86_64)
3685129044dSC.J. Collier		/* B.2 copy 2 mbuf point into rx_pkts  */
3695129044dSC.J. Collier		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
3709365d6cfSChristian Ehrhardt#endif
3715129044dSC.J. Collier
3725129044dSC.J. Collier		if (split_packet) {
373809f0800SChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos]);
374809f0800SChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
375809f0800SChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
376809f0800SChristian Ehrhardt			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
3775129044dSC.J. Collier		}
3785129044dSC.J. Collier
3795129044dSC.J. Collier		/* avoid compiler reorder optimization */
3805129044dSC.J. Collier		rte_compiler_barrier();
3815129044dSC.J. Collier
3825129044dSC.J. Collier		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
3835129044dSC.J. Collier		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
3845129044dSC.J. Collier		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
3855129044dSC.J. Collier
3865129044dSC.J. Collier		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
3875129044dSC.J. Collier		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
3885129044dSC.J. Collier		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);
3895129044dSC.J. Collier
3905129044dSC.J. Collier		/* C.1 4=>2 filter staterr info only */
3915129044dSC.J. Collier		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
3925129044dSC.J. Collier		/* C.1 4=>2 filter staterr info only */
3935129044dSC.J. Collier		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
3945129044dSC.J. Collier
3955129044dSC.J. Collier		/* set ol_flags with vlan packet type */
396809f0800SChristian Ehrhardt		desc_to_olflags_v(descs, vlan_flags, &rx_pkts[pos]);
3975129044dSC.J. Collier
3985129044dSC.J. Collier		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
3995129044dSC.J. Collier		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
4005129044dSC.J. Collier		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
4015129044dSC.J. Collier
4025129044dSC.J. Collier		/* C.2 get 4 pkts staterr value  */
4035129044dSC.J. Collier		zero = _mm_xor_si128(dd_check, dd_check);
4045129044dSC.J. Collier		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
4055129044dSC.J. Collier
4065129044dSC.J. Collier		/* D.3 copy final 3,4 data to rx_pkts */
4075129044dSC.J. Collier		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
4085129044dSC.J. Collier				pkt_mb4);
4095129044dSC.J. Collier		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
4105129044dSC.J. Collier				pkt_mb3);
4115129044dSC.J. Collier
4125129044dSC.J. Collier		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
4135129044dSC.J. Collier		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
4145129044dSC.J. Collier		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
4155129044dSC.J. Collier
4165129044dSC.J. Collier		/* C* extract and record EOP bit */
4175129044dSC.J. Collier		if (split_packet) {
4185129044dSC.J. Collier			__m128i eop_shuf_mask = _mm_set_epi8(
4195129044dSC.J. Collier					0xFF, 0xFF, 0xFF, 0xFF,
4205129044dSC.J. Collier					0xFF, 0xFF, 0xFF, 0xFF,
4215129044dSC.J. Collier					0xFF, 0xFF, 0xFF, 0xFF,
4225129044dSC.J. Collier					0x04, 0x0C, 0x00, 0x08
4235129044dSC.J. Collier					);
4245129044dSC.J. Collier
4255129044dSC.J. Collier			/* and with mask to extract bits, flipping 1-0 */
4265129044dSC.J. Collier			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
4275129044dSC.J. Collier			/* the staterr values are not in order, as the count
4285129044dSC.J. Collier			 * count of dd bits doesn't care. However, for end of
4295129044dSC.J. Collier			 * packet tracking, we do care, so shuffle. This also
430809f0800SChristian Ehrhardt			 * compresses the 32-bit values to 8-bit
431809f0800SChristian Ehrhardt			 */
4325129044dSC.J. Collier			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
4335129044dSC.J. Collier			/* store the resulting 32-bit value */
4345129044dSC.J. Collier			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
4355129044dSC.J. Collier			split_packet += RTE_IXGBE_DESCS_PER_LOOP;
4365129044dSC.J. Collier
4375129044dSC.J. Collier			/* zero-out next pointers */
4385129044dSC.J. Collier			rx_pkts[pos]->next = NULL;
4395129044dSC.J. Collier			rx_pkts[pos + 1]->next = NULL;
4405129044dSC.J. Collier			rx_pkts[pos + 2]->next = NULL;
4415129044dSC.J. Collier			rx_pkts[pos + 3]->next = NULL;
4425129044dSC.J. Collier		}
4435129044dSC.J. Collier
4445129044dSC.J. Collier		/* C.3 calc available number of desc */
4455129044dSC.J. Collier		staterr = _mm_and_si128(staterr, dd_check);
4465129044dSC.J. Collier		staterr = _mm_packs_epi32(staterr, zero);
4475129044dSC.J. Collier
4485129044dSC.J. Collier		/* D.3 copy final 1,2 data to rx_pkts */
4495129044dSC.J. Collier		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
4505129044dSC.J. Collier				pkt_mb2);
4515129044dSC.J. Collier		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
4525129044dSC.J. Collier				pkt_mb1);
4535129044dSC.J. Collier
4545129044dSC.J. Collier		/* C.4 calc avaialbe number of desc */
4555129044dSC.J. Collier		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
4565129044dSC.J. Collier		nb_pkts_recd += var;
4575129044dSC.J. Collier		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
4585129044dSC.J. Collier			break;
4595129044dSC.J. Collier	}
4605129044dSC.J. Collier
4615129044dSC.J. Collier	/* Update our internal tail pointer */
4625129044dSC.J. Collier	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
4635129044dSC.J. Collier	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
4645129044dSC.J. Collier	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
4655129044dSC.J. Collier
4665129044dSC.J. Collier	return nb_pkts_recd;
4675129044dSC.J. Collier}
4685129044dSC.J. Collier
/*
 * vPMD receive routine, only accepts nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bit
 * - nb_pkts is floor-aligned to a multiple of RTE_IXGBE_DESCS_PER_LOOP
 *
 * Calls _recv_raw_pkts_vec() without a split-flag buffer and returns its
 * result unchanged.
 */
uint16_t
ixgbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

4855129044dSC.J. Collier/*
4865129044dSC.J. Collier * vPMD receive routine that reassembles scattered packets
4875129044dSC.J. Collier *
4885129044dSC.J. Collier * Notice:
4895129044dSC.J. Collier * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
4905129044dSC.J. Collier * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
4915129044dSC.J. Collier *   numbers of DD bit
4925129044dSC.J. Collier * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
4935129044dSC.J. Collier */
4945129044dSC.J. Collieruint16_t
4955129044dSC.J. Collierixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
4965129044dSC.J. Collier		uint16_t nb_pkts)
4975129044dSC.J. Collier{
4985129044dSC.J. Collier	struct ixgbe_rx_queue *rxq = rx_queue;
4995129044dSC.J. Collier	uint8_t split_flags[RTE_IXGBE_MAX_RX_BURST] = {0};
5005129044dSC.J. Collier
5015129044dSC.J. Collier	/* get some new buffers */
5025129044dSC.J. Collier	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
5035129044dSC.J. Collier			split_flags);
5045129044dSC.J. Collier	if (nb_bufs == 0)
5055129044dSC.J. Collier		return 0;
5065129044dSC.J. Collier
5075129044dSC.J. Collier	/* happy day case, full burst + no packets to be joined */
5085129044dSC.J. Collier	const uint64_t *split_fl64 = (uint64_t *)split_flags;
5095129044dSC.J. Collier	if (rxq->pkt_first_seg == NULL &&
5105129044dSC.J. Collier			split_fl64[0] == 0 && split_fl64[1] == 0 &&
5115129044dSC.J. Collier			split_fl64[2] == 0 && split_fl64[3] == 0)
5125129044dSC.J. Collier		return nb_bufs;
5135129044dSC.J. Collier
5145129044dSC.J. Collier	/* reassemble any packets that need reassembly*/
5155129044dSC.J. Collier	unsigned i = 0;
5165129044dSC.J. Collier	if (rxq->pkt_first_seg == NULL) {
5175129044dSC.J. Collier		/* find the first split flag, and only reassemble then*/
5185129044dSC.J. Collier		while (i < nb_bufs && !split_flags[i])
5195129044dSC.J. Collier			i++;
5205129044dSC.J. Collier		if (i == nb_bufs)
5215129044dSC.J. Collier			return nb_bufs;
5225129044dSC.J. Collier	}
5235129044dSC.J. Collier	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
5245129044dSC.J. Collier		&split_flags[i]);
5255129044dSC.J. Collier}
5265129044dSC.J. Collier
5275129044dSC.J. Collierstatic inline void
5285129044dSC.J. Colliervtx1(volatile union ixgbe_adv_tx_desc *txdp,
5295129044dSC.J. Collier		struct rte_mbuf *pkt, uint64_t flags)
5305129044dSC.J. Collier{
5315129044dSC.J. Collier	__m128i descriptor = _mm_set_epi64x((uint64_t)pkt->pkt_len << 46 |
5325129044dSC.J. Collier			flags | pkt->data_len,
5335129044dSC.J. Collier			pkt->buf_physaddr + pkt->data_off);
5345129044dSC.J. Collier	_mm_store_si128((__m128i *)&txdp->read, descriptor);
5355129044dSC.J. Collier}
5365129044dSC.J. Collier
5375129044dSC.J. Collierstatic inline void
5385129044dSC.J. Colliervtx(volatile union ixgbe_adv_tx_desc *txdp,
5395129044dSC.J. Collier		struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
5405129044dSC.J. Collier{
5415129044dSC.J. Collier	int i;
542809f0800SChristian Ehrhardt
5435129044dSC.J. Collier	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
5445129044dSC.J. Collier		vtx1(txdp, *pkt, flags);
5455129044dSC.J. Collier}
5465129044dSC.J. Collier
5475129044dSC.J. Collieruint16_t
5485129044dSC.J. Collierixgbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
5495129044dSC.J. Collier		       uint16_t nb_pkts)
5505129044dSC.J. Collier{
5515129044dSC.J. Collier	struct ixgbe_tx_queue *txq = (struct ixgbe_tx_queue *)tx_queue;
5525129044dSC.J. Collier	volatile union ixgbe_adv_tx_desc *txdp;
5535129044dSC.J. Collier	struct ixgbe_tx_entry_v *txep;
5545129044dSC.J. Collier	uint16_t n, nb_commit, tx_id;
5555129044dSC.J. Collier	uint64_t flags = DCMD_DTYP_FLAGS;
5565129044dSC.J. Collier	uint64_t rs = IXGBE_ADVTXD_DCMD_RS|DCMD_DTYP_FLAGS;
5575129044dSC.J. Collier	int i;
5585129044dSC.J. Collier
5595129044dSC.J. Collier	/* cross rx_thresh boundary is not allowed */
5605129044dSC.J. Collier	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
5615129044dSC.J. Collier
5625129044dSC.J. Collier	if (txq->nb_tx_free < txq->tx_free_thresh)
5635129044dSC.J. Collier		ixgbe_tx_free_bufs(txq);
5645129044dSC.J. Collier
5655129044dSC.J. Collier	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
5665129044dSC.J. Collier	if (unlikely(nb_pkts == 0))
5675129044dSC.J. Collier		return 0;
5685129044dSC.J. Collier
5695129044dSC.J. Collier	tx_id = txq->tx_tail;
5705129044dSC.J. Collier	txdp = &txq->tx_ring[tx_id];
5715129044dSC.J. Collier	txep = &txq->sw_ring_v[tx_id];
5725129044dSC.J. Collier
5735129044dSC.J. Collier	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
5745129044dSC.J. Collier
5755129044dSC.J. Collier	n = (uint16_t)(txq->nb_tx_desc - tx_id);
5765129044dSC.J. Collier	if (nb_commit >= n) {
5775129044dSC.J. Collier
5785129044dSC.J. Collier		tx_backlog_entry(txep, tx_pkts, n);
5795129044dSC.J. Collier
5805129044dSC.J. Collier		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
5815129044dSC.J. Collier			vtx1(txdp, *tx_pkts, flags);
5825129044dSC.J. Collier
5835129044dSC.J. Collier		vtx1(txdp, *tx_pkts++, rs);
5845129044dSC.J. Collier
5855129044dSC.J. Collier		nb_commit = (uint16_t)(nb_commit - n);
5865129044dSC.J. Collier
5875129044dSC.J. Collier		tx_id = 0;
5885129044dSC.J. Collier		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
5895129044dSC.J. Collier
5905129044dSC.J. Collier		/* avoid reach the end of ring */
5915129044dSC.J. Collier		txdp = &(txq->tx_ring[tx_id]);
5925129044dSC.J. Collier		txep = &txq->sw_ring_v[tx_id];
5935129044dSC.J. Collier	}
5945129044dSC.J. Collier
5955129044dSC.J. Collier	tx_backlog_entry(txep, tx_pkts, nb_commit);
5965129044dSC.J. Collier
5975129044dSC.J. Collier	vtx(txdp, tx_pkts, nb_commit, flags);
5985129044dSC.J. Collier
5995129044dSC.J. Collier	tx_id = (uint16_t)(tx_id + nb_commit);
6005129044dSC.J. Collier	if (tx_id > txq->tx_next_rs) {
6015129044dSC.J. Collier		txq->tx_ring[txq->tx_next_rs].read.cmd_type_len |=
6025129044dSC.J. Collier			rte_cpu_to_le_32(IXGBE_ADVTXD_DCMD_RS);
6035129044dSC.J. Collier		txq->tx_next_rs = (uint16_t)(txq->tx_next_rs +
6045129044dSC.J. Collier			txq->tx_rs_thresh);
6055129044dSC.J. Collier	}
6065129044dSC.J. Collier
6075129044dSC.J. Collier	txq->tx_tail = tx_id;
6085129044dSC.J. Collier
6095129044dSC.J. Collier	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
6105129044dSC.J. Collier
6115129044dSC.J. Collier	return nb_pkts;
6125129044dSC.J. Collier}
6135129044dSC.J. Collier
/* release_mbufs callback: forwards to _ixgbe_tx_queue_release_mbufs_vec(). */
static void __attribute__((cold))
ixgbe_tx_queue_release_mbufs_vec(struct ixgbe_tx_queue *txq)
{
	_ixgbe_tx_queue_release_mbufs_vec(txq);
}

/* Forwards to _ixgbe_rx_queue_release_mbufs_vec() for the given RX queue. */
void __attribute__((cold))
ixgbe_rx_queue_release_mbufs_vec(struct ixgbe_rx_queue *rxq)
{
	_ixgbe_rx_queue_release_mbufs_vec(rxq);
}

/* free_swring callback: forwards to _ixgbe_tx_free_swring_vec(). */
static void __attribute__((cold))
ixgbe_tx_free_swring(struct ixgbe_tx_queue *txq)
{
	_ixgbe_tx_free_swring_vec(txq);
}

/* reset callback: forwards to _ixgbe_reset_tx_queue_vec(). */
static void __attribute__((cold))
ixgbe_reset_tx_queue(struct ixgbe_tx_queue *txq)
{
	_ixgbe_reset_tx_queue_vec(txq);
}

6385129044dSC.J. Collierstatic const struct ixgbe_txq_ops vec_txq_ops = {
6395129044dSC.J. Collier	.release_mbufs = ixgbe_tx_queue_release_mbufs_vec,
6405129044dSC.J. Collier	.free_swring = ixgbe_tx_free_swring,
6415129044dSC.J. Collier	.reset = ixgbe_reset_tx_queue,
6425129044dSC.J. Collier};
6435129044dSC.J. Collier
/* Returns the result of ixgbe_rxq_vec_setup_default() for rxq. */
int __attribute__((cold))
ixgbe_rxq_vec_setup(struct ixgbe_rx_queue *rxq)
{
	return ixgbe_rxq_vec_setup_default(rxq);
}

6505129044dSC.J. Collierint __attribute__((cold))
6515129044dSC.J. Collierixgbe_txq_vec_setup(struct ixgbe_tx_queue *txq)
6525129044dSC.J. Collier{
653809f0800SChristian Ehrhardt	return ixgbe_txq_vec_setup_default(txq, &vec_txq_ops);
6545129044dSC.J. Collier}
6555129044dSC.J. Collier
/* Returns the result of ixgbe_rx_vec_dev_conf_condition_check_default(). */
int __attribute__((cold))
ixgbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
{
	return ixgbe_rx_vec_dev_conf_condition_check_default(dev);
}