eal_pci_vfio.c revision a41e6ff1
1/*-
2 *   BSD LICENSE
3 *
4 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5 *   All rights reserved.
6 *
7 *   Redistribution and use in source and binary forms, with or without
8 *   modification, are permitted provided that the following conditions
9 *   are met:
10 *
11 *     * Redistributions of source code must retain the above copyright
12 *       notice, this list of conditions and the following disclaimer.
13 *     * Redistributions in binary form must reproduce the above copyright
14 *       notice, this list of conditions and the following disclaimer in
15 *       the documentation and/or other materials provided with the
16 *       distribution.
17 *     * Neither the name of Intel Corporation nor the names of its
18 *       contributors may be used to endorse or promote products derived
19 *       from this software without specific prior written permission.
20 *
21 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <string.h>
35#include <fcntl.h>
36#include <linux/pci_regs.h>
37#include <sys/eventfd.h>
38#include <sys/socket.h>
39#include <sys/ioctl.h>
40#include <sys/mman.h>
41
42#include <rte_log.h>
43#include <rte_pci.h>
44#include <rte_eal_memconfig.h>
45#include <rte_malloc.h>
46
47#include "eal_filesystem.h"
48#include "eal_pci_init.h"
49#include "eal_vfio.h"
50#include "eal_private.h"
51
52/**
53 * @file
54 * PCI probing under linux (VFIO version)
55 *
56 * This code tries to determine if the PCI device is bound to VFIO driver,
57 * and initialize it (map BARs, set up interrupts) if that's the case.
58 *
59 * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
60 */
61
62#ifdef VFIO_PRESENT
63
/* Runtime page size/mask; used to page-align the hole punched around the
 * MSI-X table when mapping a BAR that contains it. */
#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
#define PAGE_MASK   (~(PAGE_SIZE - 1))

/* Shared tailq of VFIO-mapped PCI resources: the primary process records
 * each mapping here so secondary processes can look up and reuse the same
 * virtual addresses (see pci_vfio_map_resource()). */
static struct rte_tailq_elem rte_vfio_tailq = {
	.name = "VFIO_RESOURCE_LIST",
};
EAL_REGISTER_TAILQ(rte_vfio_tailq)
71
72int
73pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
74		    void *buf, size_t len, off_t offs)
75{
76	return pread64(intr_handle->vfio_dev_fd, buf, len,
77	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
78}
79
80int
81pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
82		    const void *buf, size_t len, off_t offs)
83{
84	return pwrite64(intr_handle->vfio_dev_fd, buf, len,
85	       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
86}
87
/* get PCI BAR number where MSI-X interrupts are */
/*
 * Walk the PCI capability list (read via the VFIO config-space region of
 * the device fd) looking for the MSI-X capability.
 *
 * On success returns 0. If MSI-X is present, *msix_bar is set to the BAR
 * index holding the table (BIR field), *msix_table_offset to the table's
 * byte offset within that BAR, and *msix_table_size to the table size in
 * bytes (16 bytes per vector). If the device has no MSI-X capability the
 * output parameters are left untouched. Returns -1 on any config-space
 * read error.
 */
static int
pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
		      uint32_t *msix_table_size)
{
	int ret;
	uint32_t reg;
	uint16_t flags;
	uint8_t cap_id, cap_offset;

	/* read PCI capability pointer from config space */
	ret = pread64(fd, &reg, sizeof(reg),
			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
			PCI_CAPABILITY_LIST);
	if (ret != sizeof(reg)) {
		RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
				"config space!\n");
		return -1;
	}

	/* we need first byte */
	cap_offset = reg & 0xFF;

	/* a zero offset terminates the capability list */
	while (cap_offset) {

		/* read PCI capability ID */
		ret = pread64(fd, &reg, sizeof(reg),
				VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
				cap_offset);
		if (ret != sizeof(reg)) {
			RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI "
					"config space!\n");
			return -1;
		}

		/* we need first byte */
		cap_id = reg & 0xFF;

		/* if we haven't reached MSI-X, check next capability */
		if (cap_id != PCI_CAP_ID_MSIX) {
			/* re-read the capability header to get the next pointer */
			ret = pread64(fd, &reg, sizeof(reg),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset);
			if (ret != sizeof(reg)) {
				RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI "
						"config space!\n");
				return -1;
			}

			/* we need second byte (the "next capability" pointer) */
			cap_offset = (reg & 0xFF00) >> 8;

			continue;
		}
		/* else, read table offset */
		else {
			/* table offset resides in the next 4 bytes
			 * (MSI-X Table Offset/BIR register at cap + 4) */
			ret = pread64(fd, &reg, sizeof(reg),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset + 4);
			if (ret != sizeof(reg)) {
				RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config "
						"space!\n");
				return -1;
			}

			/* message control register at cap + 2 holds table size */
			ret = pread64(fd, &flags, sizeof(flags),
					VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
					cap_offset + 2);
			if (ret != sizeof(flags)) {
				RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config "
						"space!\n");
				return -1;
			}

			/* BIR = which BAR the table lives in; offset is the
			 * remaining (8-byte aligned) bits of the same register;
			 * table size field is N-1 entries of 16 bytes each */
			*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
			*msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
			*msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));

			return 0;
		}
	}
	return 0;
}
172
173/* set PCI bus mastering */
174static int
175pci_vfio_set_bus_master(int dev_fd)
176{
177	uint16_t reg;
178	int ret;
179
180	ret = pread64(dev_fd, &reg, sizeof(reg),
181			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
182			PCI_COMMAND);
183	if (ret != sizeof(reg)) {
184		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
185		return -1;
186	}
187
188	/* set the master bit */
189	reg |= PCI_COMMAND_MASTER;
190
191	ret = pwrite64(dev_fd, &reg, sizeof(reg),
192			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
193			PCI_COMMAND);
194
195	if (ret != sizeof(reg)) {
196		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
197		return -1;
198	}
199
200	return 0;
201}
202
/* set up interrupt support (but not enable interrupts) */
/*
 * Probe the device's interrupt vectors (MSI-X, then MSI, then legacy INTx,
 * in that order of preference) and set up an eventfd for the first usable
 * one. On success, fills in dev->intr_handle (fd, vfio device fd, type),
 * records the chosen mode back into internal_config.vfio_intr_mode, and
 * returns 0. Returns -1 if no suitable vector is found or on any error.
 * The interrupt is configured but NOT enabled here.
 */
static int
pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int i, ret, intr_idx;

	/* default to invalid index */
	intr_idx = VFIO_PCI_NUM_IRQS;

	/* get interrupt type from internal config (MSI-X by default, can be
	 * overridden from the command line)
	 */
	switch (internal_config.vfio_intr_mode) {
	case RTE_INTR_MODE_MSIX:
		intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_MSI:
		intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
		break;
	case RTE_INTR_MODE_LEGACY:
		intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
		break;
	/* don't do anything if we want to automatically determine interrupt type */
	case RTE_INTR_MODE_NONE:
		break;
	default:
		RTE_LOG(ERR, EAL, "  unknown default interrupt type!\n");
		return -1;
	}

	/* start from MSI-X interrupt type; the VFIO IRQ indices are ordered
	 * so that counting down walks MSI-X -> MSI -> INTx */
	for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
		struct vfio_irq_info irq = { .argsz = sizeof(irq) };
		int fd = -1;

		/* skip interrupt modes we don't want */
		if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE &&
				i != intr_idx)
			continue;

		irq.index = i;

		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "  cannot get IRQ info, "
					"error %i (%s)\n", errno, strerror(errno));
			return -1;
		}

		/* if this vector cannot be used with eventfd, fail if we explicitly
		 * specified interrupt type, otherwise continue */
		if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
			if (internal_config.vfio_intr_mode != RTE_INTR_MODE_NONE) {
				RTE_LOG(ERR, EAL,
						"  interrupt vector does not support eventfd!\n");
				return -1;
			} else
				continue;
		}

		/* set up an eventfd for interrupts */
		fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "  cannot set up eventfd, "
					"error %i (%s)\n", errno, strerror(errno));
			return -1;
		}

		dev->intr_handle.fd = fd;
		dev->intr_handle.vfio_dev_fd = vfio_dev_fd;

		/* record the mode we actually ended up with, so the rest of
		 * EAL (and any later device) uses the same interrupt type */
		switch (i) {
		case VFIO_PCI_MSIX_IRQ_INDEX:
			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSIX;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
			break;
		case VFIO_PCI_MSI_IRQ_INDEX:
			internal_config.vfio_intr_mode = RTE_INTR_MODE_MSI;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
			break;
		case VFIO_PCI_INTX_IRQ_INDEX:
			internal_config.vfio_intr_mode = RTE_INTR_MODE_LEGACY;
			dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
			break;
		default:
			RTE_LOG(ERR, EAL, "  unknown interrupt type!\n");
			return -1;
		}

		return 0;
	}

	/* if we're here, we haven't found a suitable interrupt vector */
	return -1;
}
298
299/*
300 * map the PCI resources of a PCI device in virtual memory (VFIO version).
301 * primary and secondary processes follow almost exactly the same path
302 */
303int
304pci_vfio_map_resource(struct rte_pci_device *dev)
305{
306	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
307	char pci_addr[PATH_MAX] = {0};
308	int vfio_dev_fd;
309	struct rte_pci_addr *loc = &dev->addr;
310	int i, ret, msix_bar;
311	struct mapped_pci_resource *vfio_res = NULL;
312	struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
313
314	struct pci_map *maps;
315	uint32_t msix_table_offset = 0;
316	uint32_t msix_table_size = 0;
317	uint32_t ioport_bar;
318
319	dev->intr_handle.fd = -1;
320	dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
321
322	/* store PCI address string */
323	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
324			loc->domain, loc->bus, loc->devid, loc->function);
325
326	if ((ret = vfio_setup_device(pci_get_sysfs_path(), pci_addr,
327					&vfio_dev_fd, &device_info)))
328		return ret;
329
330	/* get MSI-X BAR, if any (we have to know where it is because we can't
331	 * easily mmap it when using VFIO) */
332	msix_bar = -1;
333	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_bar,
334				    &msix_table_offset, &msix_table_size);
335	if (ret < 0) {
336		RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n", pci_addr);
337		close(vfio_dev_fd);
338		return -1;
339	}
340
341	/* if we're in a primary process, allocate vfio_res and get region info */
342	if (internal_config.process_type == RTE_PROC_PRIMARY) {
343		vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
344		if (vfio_res == NULL) {
345			RTE_LOG(ERR, EAL,
346				"%s(): cannot store uio mmap details\n", __func__);
347			close(vfio_dev_fd);
348			return -1;
349		}
350		memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
351
352		/* get number of registers (up to BAR5) */
353		vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
354				VFIO_PCI_BAR5_REGION_INDEX + 1);
355	} else {
356		/* if we're in a secondary process, just find our tailq entry */
357		TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
358			if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
359				continue;
360			break;
361		}
362		/* if we haven't found our tailq entry, something's wrong */
363		if (vfio_res == NULL) {
364			RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
365					pci_addr);
366			close(vfio_dev_fd);
367			return -1;
368		}
369	}
370
371	/* map BARs */
372	maps = vfio_res->maps;
373
374	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
375		struct vfio_region_info reg = { .argsz = sizeof(reg) };
376		void *bar_addr;
377		struct memreg {
378			unsigned long offset, size;
379		} memreg[2] = {};
380
381		reg.index = i;
382
383		ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
384
385		if (ret) {
386			RTE_LOG(ERR, EAL, "  %s cannot get device region info "
387					"error %i (%s)\n", pci_addr, errno, strerror(errno));
388			close(vfio_dev_fd);
389			if (internal_config.process_type == RTE_PROC_PRIMARY)
390				rte_free(vfio_res);
391			return -1;
392		}
393
394		/* chk for io port region */
395		ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
396			      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
397			      + PCI_BASE_ADDRESS_0 + i*4);
398
399		if (ret != sizeof(ioport_bar)) {
400			RTE_LOG(ERR, EAL,
401				"Cannot read command (%x) from config space!\n",
402				PCI_BASE_ADDRESS_0 + i*4);
403			return -1;
404		}
405
406		if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) {
407			RTE_LOG(INFO, EAL,
408				"Ignore mapping IO port bar(%d) addr: %x\n",
409				 i, ioport_bar);
410			continue;
411		}
412
413		/* skip non-mmapable BARs */
414		if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
415			continue;
416
417		if (i == msix_bar) {
418			/*
419			 * VFIO will not let us map the MSI-X table,
420			 * but we can map around it.
421			 */
422			uint32_t table_start = msix_table_offset;
423			uint32_t table_end = table_start + msix_table_size;
424			table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
425			table_start &= PAGE_MASK;
426
427			if (table_start == 0 && table_end >= reg.size) {
428				/* Cannot map this BAR */
429				RTE_LOG(DEBUG, EAL, "Skipping BAR %d\n", i);
430				continue;
431			} else {
432				memreg[0].offset = reg.offset;
433				memreg[0].size = table_start;
434				memreg[1].offset = table_end;
435				memreg[1].size = reg.size - table_end;
436
437				RTE_LOG(DEBUG, EAL,
438					"Trying to map BAR %d that contains the MSI-X "
439					"table. Trying offsets: "
440					"0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", i,
441					memreg[0].offset, memreg[0].size,
442					memreg[1].offset, memreg[1].size);
443			}
444		} else {
445			memreg[0].offset = reg.offset;
446			memreg[0].size = reg.size;
447		}
448
449		/* try to figure out an address */
450		if (internal_config.process_type == RTE_PROC_PRIMARY) {
451			/* try mapping somewhere close to the end of hugepages */
452			if (pci_map_addr == NULL)
453				pci_map_addr = pci_find_max_end_va();
454
455			bar_addr = pci_map_addr;
456			pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
457		} else {
458			bar_addr = maps[i].addr;
459		}
460
461		/* reserve the address using an inaccessible mapping */
462		bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
463				MAP_ANONYMOUS, -1, 0);
464		if (bar_addr != MAP_FAILED) {
465			void *map_addr = NULL;
466			if (memreg[0].size) {
467				/* actual map of first part */
468				map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
469							    memreg[0].offset,
470							    memreg[0].size,
471							    MAP_FIXED);
472			}
473
474			/* if there's a second part, try to map it */
475			if (map_addr != MAP_FAILED
476			    && memreg[1].offset && memreg[1].size) {
477				void *second_addr = RTE_PTR_ADD(bar_addr, memreg[1].offset);
478				map_addr = pci_map_resource(second_addr,
479							    vfio_dev_fd, memreg[1].offset,
480							    memreg[1].size,
481							    MAP_FIXED);
482			}
483
484			if (map_addr == MAP_FAILED || !map_addr) {
485				munmap(bar_addr, reg.size);
486				bar_addr = MAP_FAILED;
487			}
488		}
489
490		if (bar_addr == MAP_FAILED ||
491				(internal_config.process_type == RTE_PROC_SECONDARY &&
492						bar_addr != maps[i].addr)) {
493			RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n", pci_addr, i,
494					strerror(errno));
495			close(vfio_dev_fd);
496			if (internal_config.process_type == RTE_PROC_PRIMARY)
497				rte_free(vfio_res);
498			return -1;
499		}
500
501		maps[i].addr = bar_addr;
502		maps[i].offset = reg.offset;
503		maps[i].size = reg.size;
504		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
505		dev->mem_resource[i].addr = bar_addr;
506	}
507
508	/* if secondary process, do not set up interrupts */
509	if (internal_config.process_type == RTE_PROC_PRIMARY) {
510		if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
511			RTE_LOG(ERR, EAL, "  %s error setting up interrupts!\n", pci_addr);
512			close(vfio_dev_fd);
513			rte_free(vfio_res);
514			return -1;
515		}
516
517		/* set bus mastering for the device */
518		if (pci_vfio_set_bus_master(vfio_dev_fd)) {
519			RTE_LOG(ERR, EAL, "  %s cannot set up bus mastering!\n", pci_addr);
520			close(vfio_dev_fd);
521			rte_free(vfio_res);
522			return -1;
523		}
524
525		/* Reset the device */
526		ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
527	}
528
529	if (internal_config.process_type == RTE_PROC_PRIMARY)
530		TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);
531
532	return 0;
533}
534
535int
536pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
537		    struct rte_pci_ioport *p)
538{
539	if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
540	    bar > VFIO_PCI_BAR5_REGION_INDEX) {
541		RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
542		return -1;
543	}
544
545	p->dev = dev;
546	p->base = VFIO_GET_REGION_ADDR(bar);
547	return 0;
548}
549
550void
551pci_vfio_ioport_read(struct rte_pci_ioport *p,
552		     void *data, size_t len, off_t offset)
553{
554	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
555
556	if (pread64(intr_handle->vfio_dev_fd, data,
557		    len, p->base + offset) <= 0)
558		RTE_LOG(ERR, EAL,
559			"Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
560			VFIO_GET_REGION_IDX(p->base), (int)offset);
561}
562
563void
564pci_vfio_ioport_write(struct rte_pci_ioport *p,
565		      const void *data, size_t len, off_t offset)
566{
567	const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
568
569	if (pwrite64(intr_handle->vfio_dev_fd, data,
570		     len, p->base + offset) <= 0)
571		RTE_LOG(ERR, EAL,
572			"Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
573			VFIO_GET_REGION_IDX(p->base), (int)offset);
574}
575
/*
 * VFIO I/O port access goes through region offsets on the device fd; no
 * per-port mapping exists, so there is nothing to unmap. Always returns -1
 * to signal the operation is unsupported.
 */
int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	RTE_SET_USED(p);

	return -1;
}
582
/* Initialize the generic VFIO layer for the vfio-pci kernel driver.
 * Returns whatever vfio_enable() reports. */
int
pci_vfio_enable(void)
{
	static const char drv_name[] = "vfio_pci";

	return vfio_enable(drv_name);
}
588
/* Report whether the VFIO layer was successfully enabled for vfio-pci.
 * Thin forwarder to vfio_is_enabled(). */
int
pci_vfio_is_enabled(void)
{
	static const char drv_name[] = "vfio_pci";

	return vfio_is_enabled(drv_name);
}
594#endif
595