Bootstrap

基于MDEV的PCI设备虚拟化DEMO实现

利用周末时间做了一个MDEV虚拟化PCI设备的小试验,简单记录一下:

DEMO架构,此图参考了内核文档:Documentation/driver-api/vfio-mediated-device.rst

Demo 框架:参考如下文章中的受控直通方案,区别是由于实验中的watchdog是纯粹的模拟设备,包括BAR IO空间实际上对应的都是内存BUFFER,不需要PASS-THROUGH。

五分钟技术趣谈 | 浅谈GPU虚拟化

host kernel watchdog pci driver:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/uuid.h>
#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/mdev.h>
#include <linux/pci.h>

/* Size in bytes of the buffer backing the emulated BAR0 window. */
#define IO_BAR0_SIZE 32
/* Size in bytes of the buffer backing the emulated PCI config space. */
#define IO_CONF_SIZE 0x100
/* PCI vendor/device IDs written into the emulated config space. */
#define CZL_WDG_DEVICE_VENDOR_ID 0xbeef
#define CZL_WDG_DEVICE_DEVICE_ID 0x1001
/* Debug print that prefixes the message with the function name and line. */
#define API_DBG(fmt, ...) do { \
                printk("%s line %d, "fmt, __func__, __LINE__, ##__VA_ARGS__); \
        } while (0)

/*
 * Host-side parent device state: the char device region, the class, the
 * cdev and the struct device that is registered with the mdev core.
 */
struct czl_wdg_dev {
	/* first dev_t of the region obtained from alloc_chrdev_region() */
	dev_t         wdg_devt;
	/* class created with class_create() that owns 'dev' */
	struct class *wdg_class;
	/* cdev added over the whole allocated region */
	struct cdev   wdg_cdev;
	/* device passed to mdev_register_device() as the mdev parent */
	struct device dev;
};

/* Per-region bookkeeping for one VFIO PCI region of a mediated device. */
struct mdev_region_info {
	/* guest-programmed base address, filled in by mdev_read_base() */
	u64 start;
	/* not written by any code visible in this file */
	u64 phys_start;
	/* region size reported through VFIO_DEVICE_GET_REGION_INFO */
	u32 size;
	/* file offset of the region (region index << WDG_VFIO_PCI_OFFSET_SHIFT) */
	u64 vfio_offset;
};

/* State of one mediated watchdog instance, stored as the mdev drvdata. */
struct wdg_mdev_state {
	/* IO_CONF_SIZE-byte buffer holding the emulated PCI config space */
	u8 *config;
	/* IO_BAR0_SIZE-byte buffer backing BAR accesses */
	u8 *iobase;
	/* owning mediated device */
	struct mdev_device *mdev;
	/* per-region info, indexed by VFIO region index */
	struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
	/* BAR sizing masks; only entry 0 is set in this file */
	u32 bar_mask[VFIO_PCI_NUM_REGIONS];
	/* link in wdg_mdev_devices_list */
	struct list_head next;
	/* copy of the info returned by the last VFIO_DEVICE_GET_INFO */
	struct vfio_device_info dev_info;
	/* 1-based number of the mdev type this instance was created from */
	int index;
	/* serializes region accesses and region-info updates */
	struct mutex ops_lock;
};

/* File operations for the parent cdev; only the owner is set, no handlers. */
static const struct file_operations czl_wdg_fops = {
	.owner          = THIS_MODULE,
};

/* Protects wdg_mdev_devices_list (taken in create and remove). */
static struct mutex wdg_mdev_list_lock;
/* All live wdg_mdev_state instances, linked through their 'next' member. */
static struct list_head wdg_mdev_devices_list;
/*
 * The VFIO file offset encodes the region index in the bits above
 * WDG_VFIO_PCI_OFFSET_SHIFT and the offset inside the region below it.
 */
#define WDG_VFIO_PCI_OFFSET_SHIFT   (40)
/* Region index encoded in a file offset (argument parenthesized so that
 * compound expressions are shifted as a whole). */
#define WDG_VFIO_PCI_OFFSET_TO_INDEX(off)   ((off) >> WDG_VFIO_PCI_OFFSET_SHIFT)
/* File offset at which region 'index' starts. */
#define WDG_VFIO_PCI_INDEX_TO_OFFSET(index) \
                                        ((u64)(index) << WDG_VFIO_PCI_OFFSET_SHIFT)
/* Mask selecting the offset-within-region bits of a file offset. */
#define WDG_VFIO_PCI_OFFSET_MASK    \
                                (((u64)(1) << WDG_VFIO_PCI_OFFSET_SHIFT) - 1)
/* Upper bound used when reporting available mdev instances. */
#define MAX_WDGS                    (16)
/* The single parent device instance of this module. */
static struct czl_wdg_dev czl_wdg;

/*
 * sysfs read handler for the parent device's "czl_wdg_dev" attribute.
 * Writes a fixed one-line description into @buf and returns its length.
 */
static ssize_t
czl_wdg_dev_show(struct device *dev, struct device_attribute *attr,
                 char *buf)
{
	static const char banner[] =
		"mdev emulated pci watchdog device by caozilong.\n";

	return sprintf(buf, "%s", banner);
}
static DEVICE_ATTR_RO(czl_wdg_dev);

/* Attributes of the parent device (NULL-terminated). */
static struct attribute *wdg_dev_attrs[] = {
	&dev_attr_czl_wdg_dev.attr,
	NULL,
};

/* Groups the parent attributes under a "czl_wdg" sysfs directory. */
static const struct attribute_group wdg_dev_group = {
	.name  = "czl_wdg",
	.attrs = wdg_dev_attrs,
};

/* Passed to the mdev core as .dev_attr_groups (NULL-terminated). */
static const struct attribute_group *wdg_dev_groups[] = {
	&wdg_dev_group,
	NULL,
};


/*
 * sysfs read handler for a mediated device's "mdev_dev" attribute.
 * Prints the device name when @dev is a mediated device, otherwise an
 * empty line.
 */
static ssize_t
mdev_dev_show(struct device *dev, struct device_attribute *attr,
              char *buf)
{
	if (!mdev_from_dev(dev))
		return sprintf(buf, "\n");

	return sprintf(buf, "This is watchdog %s\n", dev_name(dev));
}

static DEVICE_ATTR_RO(mdev_dev);

/* Attributes attached to every mediated device (NULL-terminated). */
static struct attribute *mdev_dev_attrs[] = {
	&dev_attr_mdev_dev.attr,
	NULL,
};

/* Groups the per-mdev attributes under a "caozilong" sysfs directory. */
static const struct attribute_group mdev_dev_group = {
	.name  = "caozilong",
	.attrs = mdev_dev_attrs,
};

/* Passed to the mdev core as .mdev_attr_groups (NULL-terminated). */
static const struct attribute_group *mdev_dev_groups[] = {
	&mdev_dev_group,
	NULL,
};


/*
 * "name" attribute of an mdev type. The type directory name is expected to
 * be "<driver string>-<N>" with N in 1..3; the matching human-readable
 * label is printed. Returns -EINVAL when no type matches.
 */
static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
{
	static const char * const labels[3] = {
		"Soft Watchdog", "Hardware Watchdog", "Dummy Watchdog"
	};
	char type_name[128];
	int idx = 0;

	while (idx < 3) {
		snprintf(type_name, sizeof(type_name), "%s-%d",
		         dev_driver_string(dev), idx + 1);
		if (strcmp(kobj->name, type_name) == 0)
			return sprintf(buf, "%s\n", labels[idx]);
		idx++;
	}

	return -EINVAL;
}

/* "device_api" attribute of an mdev type: reports the VFIO PCI API string. */
static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
                               char *buf)
{
	const char *api = VFIO_DEVICE_API_PCI_STRING;

	return sprintf(buf, "%s\n", api);
}

static ssize_t
available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
{
	struct wdg_mdev_state *mds;
	int used = 0;

	list_for_each_entry(mds, &wdg_mdev_devices_list, next) {
		used ++;
	}

	return sprintf(buf, "%d\n", (MAX_WDGS - used));
}

/* Read-only per-type attributes backed by the *_show handlers above. */
static MDEV_TYPE_ATTR_RO(name);
static MDEV_TYPE_ATTR_RO(device_api);
static MDEV_TYPE_ATTR_RO(available_instances);

/* Attribute set shared by all three mdev types (NULL-terminated). */
static struct attribute *mdev_types_attrs[] = {
	&mdev_type_attr_name.attr,
	&mdev_type_attr_device_api.attr,
	&mdev_type_attr_available_instances.attr,
	NULL,
};

/*
 * Three supported mdev types named "1", "2" and "3". create and name_show
 * compare the type kobject name against "<driver string>-<N>".
 */
static struct attribute_group mdev_type_group1 = {
	.name  = "1",
	.attrs = mdev_types_attrs,
};

static struct attribute_group mdev_type_group2 = {
	.name  = "2",
	.attrs = mdev_types_attrs,
};

static struct attribute_group mdev_type_group3 = {
	.name  = "3",
	.attrs = mdev_types_attrs,
};

/* Passed to the mdev core as .supported_type_groups (NULL-terminated). */
static struct attribute_group *mdev_type_groups[] = {
	&mdev_type_group1,
	&mdev_type_group2,
	&mdev_type_group3,
	NULL,
};

/* mdev .open callback: only logs the event and always succeeds. */
static int czl_wdg_open(struct mdev_device *mdev)
{
	pr_info("%s line %d, wdg device opened.\n", __func__, __LINE__);

	return 0;
}

/* mdev .release callback: only logs the event. */
static void czl_wdg_close(struct mdev_device *mdev)
{
	pr_info("%s line %d, wdg device close.\n", __func__, __LINE__);
}

// fill pci config space meta data & capabilities.
int wdg_create_config_space(struct wdg_mdev_state *mstate)
{
	// vendor id, device id.
	*((unsigned int *)&mstate->config[0]) = CZL_WDG_DEVICE_VENDOR_ID |
	                                        (CZL_WDG_DEVICE_DEVICE_ID << 16);
	*((unsigned short *)&mstate->config[4]) = 0x0001;
	*((unsigned short *)&mstate->config[6]) = 0x0200;

	mstate->config[0x8] =  0x10;
	mstate->config[0x9] =  0x02;
	mstate->config[0xa] =  0x00;
	mstate->config[0xb] =  0x07;

	*((unsigned int *)&mstate->config[0x10]) = 0x000001;
	mstate->bar_mask[0] = ~(IO_BAR0_SIZE) + 1;
	*((unsigned int *)&mstate->config[0x2c]) = 0x10011af4;

	// cap ptr.
	mstate->config[0x34] =  0x00;
	mstate->config[0x3d] =  0x01;
	mstate->config[0x40] =  0x23;
	mstate->config[0x43] =  0x80;
	mstate->config[0x44] =  0x23;
	mstate->config[0x48] =  0x23;
	mstate->config[0x4c] =  0x23;
	mstate->config[0x60] =  0x50;

	mstate->config[0x61] =  0x43;
	mstate->config[0x62] =  0x49;
	mstate->config[0x63] =  0x20;
	mstate->config[0x64] =  0x53;
	mstate->config[0x65] =  0x65;
	mstate->config[0x66] =  0x72;
	mstate->config[0x67] =  0x69;
	mstate->config[0x68] =  0x61;
	mstate->config[0x69] =  0x6c;
	mstate->config[0x6a] =  0x2f;
	mstate->config[0x6b] =  0x55;
	mstate->config[0x6c] =  0x41;
	mstate->config[0x6d] =  0x52;
	mstate->config[0x6e] =  0x54;

	return 0;
}

static int czl_wdg_create(struct kobject *kobj, struct mdev_device *mdev)
{
	int i;
	struct wdg_mdev_state *mstate;
	char name[32];

	if (!mdev)
		return -EINVAL;

	for (i = 0; i < 3; i++) {
		snprintf(name, 32, "%s-%d", dev_driver_string(mdev_parent_dev(mdev)), i + 1);
		if (!strcmp(kobj->name, name)) {
			break;
		}
	}

	if (i >= 3) {
		return -EINVAL;
	}

	mstate = kzalloc(sizeof(struct wdg_mdev_state), GFP_KERNEL);
	if (mstate == NULL)
		return -ENOMEM;
	// group number in mdev_type.
	mstate->index = i + 1;
	mstate->config = kzalloc(IO_CONF_SIZE, GFP_KERNEL);
	if (mstate->config == NULL) {
		pr_err("%s line %d, alloc pci config buffer failure.\n",
		       __func__, __LINE__);
		kfree(mstate);
		return -ENOMEM;
	}

	mstate->iobase = kzalloc(IO_BAR0_SIZE, GFP_KERNEL);
	if (mstate->iobase == NULL) {
		pr_err("%s line %d, alloc pci io buffer failure.\n",
		       __func__, __LINE__);
		kfree(mstate->config);
		kfree(mstate);
		return -ENOMEM;
	}

	memset(mstate->config, 0x00, IO_CONF_SIZE);

	mutex_init(&mstate->ops_lock);
	mstate->mdev = mdev;
	mdev_set_drvdata(mdev, mstate);
	wdg_create_config_space(mstate);

	mutex_lock(&wdg_mdev_list_lock);
	list_add(&mstate->next, &wdg_mdev_devices_list);
	mutex_unlock(&wdg_mdev_list_lock);

	return 0;
}

static int czl_wdg_remove(struct mdev_device *mdev)
{
	struct wdg_mdev_state *mds, *tmp_mds;
	struct wdg_mdev_state *mstate = mdev_get_drvdata(mdev);

	int ret = -EINVAL;

	mutex_lock(&wdg_mdev_list_lock);
	list_for_each_entry_safe(mds, tmp_mds, &wdg_mdev_devices_list, next) {
		if (mstate == mds) {
			list_del(&mstate->next);
			mdev_set_drvdata(mdev, NULL);
			kfree(mstate->config);
			kfree(mstate->iobase);
			kfree(mstate);
			ret = 0;
			break;
		}
	}
	mutex_unlock(&wdg_mdev_list_lock);

	return ret;
}

/*
 * Emulate a guest write to PCI config space.
 *
 * @mstate: instance whose config buffer is updated.
 * @offset: config space offset of the access.
 * @buf:    @count bytes of data to write.
 * @count:  access width in bytes (callers in this file pass 1, 2 or 4).
 *
 * Only the interrupt line and BAR0 are writable. BAR0 is updated only by
 * full 32-bit accesses, because the callers hand in 1- and 2-byte buffers
 * for narrower accesses and reading 4 bytes from those would run past the
 * buffer. BAR1..BAR5 are not implemented and always read back as zero.
 */
static void handle_pci_cfg_space_write(struct wdg_mdev_state *mstate, u16 offset,
                                       u8 *buf, u32 count)
{
	u32 cfg_addr;

	switch (offset) {
	case 0x04: /* device control */
	case 0x06: /* device status */
		/* do nothing */
		break;
	case 0x3c: /* interrupt line */
		mstate->config[0x3c] = buf[0];
		break;
	case 0x3d: /* interrupt pin is read-only */
		break;
	case 0x10:  /* BAR0 */
		if (count != sizeof(cfg_addr)) {
			pr_info("BAR0 write of %d bytes ignored\n", count);
			break;
		}
		memcpy(&cfg_addr, buf, sizeof(cfg_addr));
		pr_info("BAR0 addr 0x%x\n", cfg_addr);
		/*
		 * Address bits below the BAR size are hard-wired to zero. This
		 * also yields the size mask for the all-ones sizing write.
		 */
		cfg_addr &= mstate->bar_mask[0];
		/* keep the read-only space-type bits of the BAR */
		cfg_addr |= (mstate->config[offset] & 0x3ul);
		memcpy(&mstate->config[offset], &cfg_addr, sizeof(cfg_addr));
		break;
	case 0x14:  /* BAR1 */
	case 0x18:  /* BAR2 */
	case 0x1c:  /* BAR3 */
	case 0x20:  /* BAR4 */
	case 0x24:  /* BAR5 */
		memset(&mstate->config[offset], 0, sizeof(u32));
		break;
	default:
		pr_info("PCI config write @0x%x of %d bytes not handled\n",
		        offset, count);
		break;
	}
}

/*
 * Emulate a guest read from PCI config space.
 *
 * Copies @count bytes starting at @offset from the config buffer into
 * @buf. Bytes that would lie beyond the IO_CONF_SIZE-byte buffer are
 * returned as 0xff instead of reading past the allocation.
 */
static void handle_pci_cfg_space_read(struct wdg_mdev_state *mstate, u16 offset,
                                      u8 *buf, u32 count)
{
	u32 avail;

	if (offset >= IO_CONF_SIZE) {
		memset(buf, 0xff, count);
		return;
	}

	avail = IO_CONF_SIZE - offset;
	if (count > avail) {
		memset(buf + avail, 0xff, count - avail);
		count = avail;
	}

	memcpy(buf, mstate->config + offset, count);
}

static void mdev_read_base(struct wdg_mdev_state *mstate)
{
	int index, pos;
	u32 start_lo, start_hi;
	u32 mem_type;

	pos = PCI_BASE_ADDRESS_0;
	for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++)  {
		if (!mstate->region_info[index].size)
			continue;
		start_lo = (*(u32 *)(mstate->config + pos)) &
		           PCI_BASE_ADDRESS_MEM_MASK;
		mem_type = (*(u32 *)(mstate->config + pos)) &
		           PCI_BASE_ADDRESS_MEM_TYPE_MASK;

		switch (mem_type) {
		case PCI_BASE_ADDRESS_MEM_TYPE_64:
			start_hi = (*(u32 *)(mstate->config + pos + 4));
			pos += 4;
			break;
		case PCI_BASE_ADDRESS_MEM_TYPE_32:
		case PCI_BASE_ADDRESS_MEM_TYPE_1M:
		default:
			start_hi = 0;
			break;
		}
		pos += 4;
		mstate->region_info[index].start = ((u64)start_hi << 32) | start_lo;
	}

	return;
}

/*
 * Emulate a guest write to a BAR region.
 *
 * Only BAR0 is backed by memory (the IO_BAR0_SIZE-byte iobase buffer).
 * Writes to another BAR, or writes not fully inside the buffer, are
 * dropped instead of overrunning the allocation.
 */
static void handle_bar_write(unsigned int index, struct wdg_mdev_state *mstate,
                             u16 offset, u8 *buf, u32 count)
{
	pr_info("%s line %d, bar %d, write offset 0x%x, count 0x%x, val 0x%x.\n",
	        __func__, __LINE__, index, offset, count, *buf);

	if (index != VFIO_PCI_BAR0_REGION_INDEX ||
	    offset >= IO_BAR0_SIZE || count > IO_BAR0_SIZE - offset) {
		pr_err("%s line %d, bar %d write out of range, dropped.\n",
		       __func__, __LINE__, index);
		return;
	}

	memcpy(mstate->iobase + offset, buf, count);
}

/*
 * Emulate a guest read from a BAR region.
 *
 * Only BAR0 is backed by memory (the IO_BAR0_SIZE-byte iobase buffer).
 * Reads from another BAR, or reads not fully inside the buffer, return
 * 0xff bytes instead of reading past the allocation. The value is logged
 * after the copy, since @buf is uninitialized on entry.
 */
static void handle_bar_read(unsigned int index, struct wdg_mdev_state *mstate,
                            u16 offset, u8 *buf, u32 count)
{
	if (index != VFIO_PCI_BAR0_REGION_INDEX ||
	    offset >= IO_BAR0_SIZE || count > IO_BAR0_SIZE - offset) {
		pr_err("%s line %d, bar %d read out of range.\n",
		       __func__, __LINE__, index);
		memset(buf, 0xff, count);
		return;
	}

	memcpy(buf, mstate->iobase + offset, count);
	pr_info("%s line %d, bar %d, read offset 0x%x, count 0x%x, val 0x%x.\n",
	        __func__, __LINE__, index, offset, count, *buf);
}

/*
 * Dispatch one region access of a mediated device.
 *
 * @mdev:     mediated device being accessed.
 * @buf:      kernel buffer of @count bytes (source for writes, destination
 *            for reads).
 * @count:    access width in bytes.
 * @pos:      VFIO file offset; the bits above WDG_VFIO_PCI_OFFSET_SHIFT
 *            select the region, the rest is the offset inside it.
 * @is_write: true for a write, false for a read.
 *
 * Returns @count on success and -EINVAL for bad arguments, an unknown
 * region, or an in-region offset that does not fit the 16-bit offset taken
 * by the region handlers (it would otherwise be truncated and alias a
 * lower offset).
 */
static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count,
                           loff_t pos, bool is_write)
{
	int ret = 0;
	unsigned int index;
	loff_t offset;
	struct wdg_mdev_state *mstate;

	if (!mdev || !buf)
		return -EINVAL;

	mstate = mdev_get_drvdata(mdev);
	if (!mstate) {
		pr_err("%s line %d. get mstate failure.\n", __func__, __LINE__);
		return -EINVAL;
	}

	mutex_lock(&mstate->ops_lock);
	index = WDG_VFIO_PCI_OFFSET_TO_INDEX(pos);
	offset = pos & WDG_VFIO_PCI_OFFSET_MASK;
	if (offset > U16_MAX) {
		ret = -EINVAL;
		goto failed;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		pr_info("%s: PCI config space %s at offset 0x%llx\n",
		        __func__, is_write ? "write" : "read", offset);
		if (is_write) {
			handle_pci_cfg_space_write(mstate, offset, buf, count);
		} else {
			handle_pci_cfg_space_read(mstate, offset, buf, count);
		}
		break;
	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		if (!mstate->region_info[index].start)
			mdev_read_base(mstate);
		if (is_write) {
			pr_info("%s: write bar%d offset 0x%llx, val 0x%x.\n",
			        __func__, index, offset, *buf);
			handle_bar_write(index, mstate, offset, buf, count);
		} else {
			handle_bar_read(index, mstate, offset, buf, count);
			/* log after the read: buf holds no data before it */
			pr_info("%s: read bar%d offset 0x%llx, val 0x%x.\n",
			        __func__, index, offset, *buf);
		}
		break;
	default:
		ret = -EINVAL;
		goto failed;
	}

	ret = count;

failed:
	mutex_unlock(&mstate->ops_lock);

	return ret;
}

/*
 * mdev .read callback.
 *
 * Splits the request into naturally aligned 4-, 2- or 1-byte accesses,
 * performs each through mdev_access() and copies the result to user space,
 * advancing *ppos as it goes. Returns the number of bytes read, or -EFAULT
 * when an access or the copy to user space fails.
 */
static ssize_t czl_wdg_read(struct mdev_device *mdev, char __user *buf,
                            size_t count, loff_t *ppos)
{
	unsigned int total = 0;

	pr_info("%s line %d, read count 0x%lx, pos 0x%llx.\n", __func__, __LINE__, count, *ppos);

	for (; count; ) {
		u8 scratch[sizeof(u32)];
		size_t chunk;
		int rc;

		/* widest access that is both available and aligned */
		if (count >= 4 && (*ppos % 4) == 0)
			chunk = 4;
		else if (count >= 2 && (*ppos % 2) == 0)
			chunk = 2;
		else
			chunk = 1;

		rc = mdev_access(mdev, scratch, chunk, *ppos, false);
		if (rc <= 0)
			goto read_err;
		if (copy_to_user(buf, scratch, chunk))
			goto read_err;

		buf += chunk;
		*ppos += chunk;
		total += chunk;
		count -= chunk;
	}

	pr_info("%s line %d, read count 0x%x.\n", __func__, __LINE__, total);
	return total;

read_err:
	pr_err("%s line %d, read err happend.\n", __func__, __LINE__);
	return -EFAULT;
}

/*
 * mdev .write callback.
 *
 * Splits the request into naturally aligned 4-, 2- or 1-byte accesses,
 * copies each piece from user space and forwards it through mdev_access(),
 * advancing *ppos as it goes. Returns the number of bytes written, or
 * -EFAULT when the copy from user space or an access fails.
 */
static ssize_t czl_wdg_write(struct mdev_device *mdev, const char __user *buf,
                             size_t count, loff_t *ppos)
{
	unsigned int total = 0;

	pr_info("%s line %d, write count 0x%lx, pos 0x%llx.\n", __func__, __LINE__, count, *ppos);

	for (; count; ) {
		u32 scratch = 0;
		size_t chunk;
		int rc;

		/* widest access that is both available and aligned */
		if (count >= 4 && (*ppos % 4) == 0)
			chunk = 4;
		else if (count >= 2 && (*ppos % 2) == 0)
			chunk = 2;
		else
			chunk = 1;

		if (copy_from_user(&scratch, buf, chunk))
			goto write_err;

		rc = mdev_access(mdev, (u8 *)&scratch, chunk, *ppos, true);
		if (rc <= 0)
			goto write_err;

		buf += chunk;
		*ppos += chunk;
		total += chunk;
		count -= chunk;
	}

	pr_info("%s line %d, write count 0x%x.\n", __func__, __LINE__, total);
	return total;

write_err:
	pr_err("%s line %d, write failure.\n", __func__, __LINE__);
	return -EFAULT;
}

/*
 * Fill @dev_info for VFIO_DEVICE_GET_INFO: a PCI device with the standard
 * VFIO PCI region and IRQ counts. Always returns 0.
 */
static int wdg_get_device_info(struct mdev_device *mdev, struct vfio_device_info *dev_info)
{
	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;

	return 0;
}

static int wdg_get_region_info(struct mdev_device *mdev, struct vfio_region_info *region_info)
{
	unsigned int size = 0;
	struct wdg_mdev_state *mstate;
	u32 bar_index;

	if (!mdev) {
		pr_err("%s line %d,mdev is null.\n", __func__, __LINE__);
		return -EINVAL;
	}

	mstate = mdev_get_drvdata(mdev);
	if (!mstate) {
		pr_err("%s line %d,mstat is null.\n", __func__, __LINE__);
		return -EINVAL;
	}

	bar_index = region_info->index;
	if (bar_index >= VFIO_PCI_NUM_REGIONS) {
		pr_err("%s line %d,bar index %d exceeds.\n", __func__, __LINE__, bar_index);
		return -EINVAL;
	}

	mutex_lock(&mstate->ops_lock);
	switch (bar_index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		size = IO_CONF_SIZE;
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		size = IO_BAR0_SIZE;
		break;
	default:
		size = 0;
		break;
	}

	mstate->region_info[bar_index].size = size;
	mstate->region_info[bar_index].vfio_offset =
	        WDG_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
	region_info->size = size;
	region_info->offset = WDG_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
	region_info->flags = VFIO_REGION_INFO_FLAG_READ |
	                     VFIO_REGION_INFO_FLAG_WRITE;

	mutex_unlock(&mstate->ops_lock);

	return 0;
}

static int wdg_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
{
	switch (irq_info->index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
	case VFIO_PCI_MSI_IRQ_INDEX:
	case VFIO_PCI_REQ_IRQ_INDEX:
		break;
	default:
		pr_err("%s line %d, irq idx %d is invalid.\n",
		       __func__, __LINE__, irq_info->index);
		return -EINVAL;
	}

	irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
	irq_info->count = 1;
	if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX)
		irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE |
		                    VFIO_IRQ_INFO_AUTOMASKED);
	else
		irq_info->flags |= VFIO_IRQ_INFO_NORESIZE;

	return 0;
}

static long czl_wdg_ioctl(struct mdev_device *mdev, unsigned int cmd,
                          unsigned long arg)
{
	int ret = 0;
	unsigned long minsz;
	struct wdg_mdev_state *mstate;

	pr_info("czl wdg ioctl enter.\n");

	if (!mdev) {
		pr_err("%s line %d, mdev is null.\n", __func__, __LINE__);
		return -EINVAL;
	}

	mstate = mdev_get_drvdata(mdev);
	if (!mstate) {
		pr_err("%s line %d, cant find mstate data.\n", __func__, __LINE__);
		return -ENODEV;
	}

	switch (cmd) {
	case VFIO_DEVICE_GET_INFO: {
		struct vfio_device_info info;
		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;
		if (info.argsz < minsz) {
			pr_err("%s line %d, info.argsz %d < minsz %ld.\n",
			       __func__, __LINE__, info.argsz, minsz);
			return -EINVAL;
		}

		ret = wdg_get_device_info(mdev, &info);
		if (ret) {
			pr_err("%s line %d, get device info failure.\n", __func__, __LINE__);
			return ret;
		}
		memcpy(&mstate->dev_info, &info, sizeof(info));
		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;
		return 0;
	}
	case VFIO_DEVICE_GET_REGION_INFO: {
		struct vfio_region_info info;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;
		if (info.argsz < minsz) {
			pr_err("%s line %d, info.argsz %d < minsz %ld.\n",
			       __func__, __LINE__, info.argsz, minsz);
			return -EINVAL;
		}

		ret = wdg_get_region_info(mdev, &info);
		if (ret) {
			pr_err("%s line %d, get region info failure.\n", __func__, __LINE__);
			return ret;
		}

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;
		return 0;
	}
	case VFIO_DEVICE_GET_IRQ_INFO: {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);
		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;
		if ((info.argsz < minsz) ||
		    (info.index >= mstate->dev_info.num_irqs))
			return -EINVAL;
		ret = wdg_get_irq_info(mdev, &info);
		if (ret)
			return ret;
		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;
		return 0;
	}
	case VFIO_DEVICE_SET_IRQS: {
		pr_info("%s line %d, set irqs.\n", __func__, __LINE__);
		return 0;
	}
	case VFIO_DEVICE_RESET:
		pr_info("%s line %d, reset.\n", __func__, __LINE__);
		return 0;
	}

	return -EINVAL;
}

/*
 * Parent ops handed to mdev_register_device(): sysfs attribute groups, the
 * supported mdev types, and the create/remove/open/release/read/write/ioctl
 * callbacks defined above.
 */
static const struct mdev_parent_ops wdg_mdev_fops = {
	.owner                  = THIS_MODULE,
	.dev_attr_groups        = wdg_dev_groups,
	.mdev_attr_groups       = mdev_dev_groups,
	.supported_type_groups  = mdev_type_groups,
	.create                 = czl_wdg_create,
	.remove                 = czl_wdg_remove,
	.open                   = czl_wdg_open,
	.release                = czl_wdg_close,
	.read                   = czl_wdg_read,
	.write                  = czl_wdg_write,
	.ioctl                  = czl_wdg_ioctl,
};

/*
 * Release callback of the parent struct device. The device is embedded in
 * the static czl_wdg object, so there is nothing to free; it only logs.
 */
static void wdg_device_release(struct device *dev)
{
	pr_info("czl wdg devide release.\n");
}

/*
 * Module init: allocate a char device region, add the cdev, create the
 * class, register the parent device and register it with the mdev core.
 *
 * The instance list and its lock are initialized first.
 * mdev_register_device() publishes the sysfs "create" and
 * "available_instances" files, whose handlers use the list and the lock
 * and may run as soon as registration completes.
 *
 * Returns 0 on success or the negative errno of the step that failed;
 * everything set up before the failure is torn down again.
 */
static int mdev_wdg_init(void)
{
	int ret;

	pr_info("czl wdg init.\n");

	memset(&czl_wdg, 0x00, sizeof(czl_wdg));

	mutex_init(&wdg_mdev_list_lock);
	INIT_LIST_HEAD(&wdg_mdev_devices_list);

	ret = alloc_chrdev_region(&czl_wdg.wdg_devt, 0, MINORMASK + 1, "czl_wdg");
	if (ret < 0) {
		pr_err("error: failed to register czl wdg device, err:%d\n", ret);
		return ret;
	}

	cdev_init(&czl_wdg.wdg_cdev, &czl_wdg_fops);
	ret = cdev_add(&czl_wdg.wdg_cdev, czl_wdg.wdg_devt, MINORMASK + 1);
	if (ret) {
		pr_err("error: failed to add wdg cdev, err:%d\n", ret);
		goto err_region;
	}

	pr_info("major_number:%d\n", MAJOR(czl_wdg.wdg_devt));

	czl_wdg.wdg_class = class_create(THIS_MODULE, "czl_wdg");
	if (IS_ERR(czl_wdg.wdg_class)) {
		pr_err("error: failed to create wdg class.\n");
		ret = PTR_ERR(czl_wdg.wdg_class);
		goto err_cdev;
	}

	czl_wdg.dev.class = czl_wdg.wdg_class;
	czl_wdg.dev.release = wdg_device_release;
	dev_set_name(&czl_wdg.dev, "%s", "czl_wdg");
	ret = device_register(&czl_wdg.dev);
	if (ret) {
		pr_err("%s line %d, register wdg device failure.\n", __func__, __LINE__);
		/* a failed device_register() must be undone with put_device() */
		put_device(&czl_wdg.dev);
		goto err_class;
	}

	ret = mdev_register_device(&czl_wdg.dev, &wdg_mdev_fops);
	if (ret) {
		pr_err("%s line %d, register wdg mdev device failure.\n", __func__, __LINE__);
		goto err_device;
	}

	pr_info("czl wdg init success.\n");
	return 0;

err_device:
	device_unregister(&czl_wdg.dev);
err_class:
	class_destroy(czl_wdg.wdg_class);
err_cdev:
	cdev_del(&czl_wdg.wdg_cdev);
err_region:
	unregister_chrdev_region(czl_wdg.wdg_devt, MINORMASK + 1);
	return ret;
}

/*
 * Module exit: undo mdev_wdg_init(). The parent is unregistered from the
 * mdev core before the device itself, then the cdev, the char device
 * region and finally the class are released.
 */
static void mdev_wdg_exit(void)
{
	/* NOTE(review): purpose of clearing dev.bus is not evident here — confirm */
	czl_wdg.dev.bus = NULL;
	mdev_unregister_device(&czl_wdg.dev);
	device_unregister(&czl_wdg.dev);
	cdev_del(&czl_wdg.wdg_cdev);
	unregister_chrdev_region(czl_wdg.wdg_devt, MINORMASK + 1);
	class_destroy(czl_wdg.wdg_class);
	czl_wdg.wdg_class = NULL;

	pr_info("czl_wdg_unload.\n");
	return;
}

/* Module entry/exit points of the host mdev parent driver. */
module_init(mdev_wdg_init)
module_exit(mdev_wdg_exit)
MODULE_LICENSE("GPL v2");

qemu virtual machine kernel  watchdog pci driver

#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/uuid.h>
#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/mdev.h>
#include <linux/pci.h>
#include <linux/idr.h>

/* Char device major returned by register_chrdev() in czl_wdg_init(). */
static int devno;
/* Maps a minor number to its struct wdg_pci_state. */
static DEFINE_IDR(wdg_minors);
/* Taken around idr_alloc()/idr_find() on wdg_minors. */
static DEFINE_MUTEX(wdg_minors_lock);
/* Exclusive upper bound passed to idr_alloc() for minor numbers. */
#define WDG_MINORS_COUNT 256

/* Per-device state of one watchdog PCI function bound in the guest. */
struct wdg_pci_state {
	/* referenced PCI device (pci_dev_get() in probe) */
	struct pci_dev *pdev;
	/* device created with device_create() for the char node */
	struct device *dev;
	/* start of PCI resource 0, used as the port base for inb()/outb() */
	int iobase;
	/* length of PCI resource 0 in bytes */
	int iolen;
	/* char device major (copy of devno) */
	int major;
	/* minor number allocated from wdg_minors */
	int minor;
};

/* Class under which the per-device char nodes are created. */
static struct class *wdg_class;
/* PCI IDs handled by this driver: vendor 0xbeef, device 0x1001. */
static const struct pci_device_id czl_pci_table[] = {
	{       PCI_DEVICE(0xbeef, 0x1001),       },
	{ 0,                                      }
};

static int czl_wdg_open(struct inode *inode, struct file *file)
{
	int rc = 0;
	int major, minor;

	major = imajor(inode);
	minor = iminor(inode);
	mutex_lock(&wdg_minors_lock);
	file->private_data = idr_find(&wdg_minors, minor);
	mutex_unlock(&wdg_minors_lock);
	if (!file->private_data) {
		pr_err("%s line %d, cant find wdg structure.\n",
		       __func__, __LINE__);
		rc = -1;
	}

	return rc;
}

/* release() handler: nothing to tear down, always returns 0. */
static int czl_wdg_release(struct inode *inode, struct file *file)
{
	return 0;
}

/*
 * read() handler: read up to @size bytes from the device's I/O BAR with
 * inb(), starting at *ppos, and copy them to user space.
 *
 * Fixes over the previous version:
 *  - kzalloc() takes (size, flags); the arguments were swapped, so the
 *    buffer size did not follow the request.
 *  - the length is clamped to the end of the BAR before allocating, so
 *    the allocation no longer scales with an arbitrary user @size.
 *  - copy_to_user() failures are reported as -EFAULT.
 *
 * The file position is intentionally left untouched, as before, so
 * repeated reads return the same window.
 *
 * Returns the number of bytes read (0 at the end of the BAR), -ENODEV
 * without device state, -EINVAL for a position outside the BAR, -ENOMEM
 * or -EFAULT.
 */
ssize_t czl_wdg_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
	struct wdg_pci_state *wdgdev = file->private_data;
	unsigned char *kbuf;
	size_t len, i;

	if (!wdgdev) {
		pr_err("%s line %d, read failure.\n", __func__, __LINE__);
		return -ENODEV;
	}

	if (*ppos < 0 || *ppos > wdgdev->iolen) {
		pr_err("%s line %d, read pos %lld exceed max io len %d.\n",
		       __func__, __LINE__, *ppos, wdgdev->iolen);
		return -EINVAL;
	}

	len = min_t(size_t, size, wdgdev->iolen - *ppos);
	if (!len)
		return 0;

	kbuf = kzalloc(len, GFP_KERNEL);
	if (kbuf == NULL) {
		pr_err("%s line %d, alloc kbuf failure.\n",
		       __func__, __LINE__);
		return -ENOMEM;
	}

	for (i = 0; i < len; i++)
		kbuf[i] = inb(wdgdev->iobase + *ppos + i);

	if (copy_to_user(buf, kbuf, len)) {
		kfree(kbuf);
		return -EFAULT;
	}

	kfree(kbuf);
	return len;
}

/*
 * write() handler: copy up to @count bytes from user space and write them
 * to the device's I/O BAR with outb(), starting at *ppos.
 *
 * Fixes over the previous version:
 *  - kzalloc() takes (size, flags); with the arguments swapped the buffer
 *    had a fixed size unrelated to @count, so copy_from_user() could
 *    overflow it.
 *  - the length is clamped to the end of the BAR before allocating and
 *    copying.
 *  - copy_from_user() failures are reported as -EFAULT.
 *
 * The file position is intentionally left untouched, as before.
 *
 * Returns the number of bytes written (0 at the end of the BAR), -ENODEV
 * without device state, -EINVAL for a position outside the BAR, -ENOMEM
 * or -EFAULT.
 */
static ssize_t czl_wdg_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
	struct wdg_pci_state *wdgdev = file->private_data;
	unsigned char *kbuf;
	size_t len, i;

	if (!wdgdev) {
		pr_err("%s line %d, write failure.\n", __func__, __LINE__);
		return -ENODEV;
	}

	if (*ppos < 0 || *ppos > wdgdev->iolen) {
		pr_err("%s line %d, write pos %lld exceed max io len %d.\n",
		       __func__, __LINE__, *ppos, wdgdev->iolen);
		return -EINVAL;
	}

	len = min_t(size_t, count, wdgdev->iolen - *ppos);
	if (!len)
		return 0;

	kbuf = kzalloc(len, GFP_KERNEL);
	if (kbuf == NULL) {
		pr_err("%s line %d, alloc kbuf failure.\n",
		       __func__, __LINE__);
		return -ENOMEM;
	}

	if (copy_from_user(kbuf, buf, len)) {
		kfree(kbuf);
		return -EFAULT;
	}

	for (i = 0; i < len; i++)
		outb((u8)kbuf[i], wdgdev->iobase + *ppos + i);

	kfree(kbuf);
	return len;
}

/*
 * File operations of the guest char device. No .llseek handler is
 * provided; read and write use the position passed in by the VFS.
 */
static const struct file_operations czl_wdg_fops = {
	.owner          = THIS_MODULE,
	.open           = czl_wdg_open,
	.release        = czl_wdg_release,
	.read           = czl_wdg_read,
	.write          = czl_wdg_write,
};

/*
 * Class devnode callback: name the node after the device and request
 * read/write permission for everyone.
 *
 * The mode used to be 06666, which additionally set the setuid and setgid
 * bits on the device node; 0666 keeps the intended rw-rw-rw- access
 * without them.
 *
 * Returns a kasprintf()-allocated node name (NULL on allocation failure).
 */
static char *wdg_devnode(struct device *dev, umode_t *mode)
{
	if (mode)
		*mode = 0666;
	return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
}

static int wdg_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct wdg_pci_state *wdgdev = NULL;

	pr_info("%s line %d, wdg pci device & driver binding.\n", __func__, __LINE__);

	wdgdev = kzalloc(GFP_KERNEL, sizeof(*wdgdev));
	if (!wdgdev) {
		pr_err("%s line %d, fail to alloc buffer.\n",
		       __func__, __LINE__);
		goto err0;
	}

	wdgdev->major = devno;

	wdgdev->pdev = pci_dev_get(pdev);
	wdgdev->iobase = pci_resource_start(pdev, 0);
	wdgdev->iolen = pci_resource_len(pdev, 0);
	mutex_lock(&wdg_minors_lock);
	wdgdev->minor = idr_alloc(&wdg_minors, wdgdev, 0, WDG_MINORS_COUNT, GFP_KERNEL);
	mutex_unlock(&wdg_minors_lock);
	if (wdgdev->minor < 0) {
		pr_err("%s line %d, get minor failure from idr.\n", __func__, __LINE__);
		goto err1;
	}

	pr_info("%s line %d, major %d, minor %d, iobase 0x%x.\n", __func__, __LINE__,
	        devno, wdgdev->minor, wdgdev->iobase);
	wdgdev->dev = device_create(wdg_class, NULL, MKDEV(devno, wdgdev->minor),
	                            NULL, "czl-wdg-%d", wdgdev->minor);
	if (!wdgdev->dev || IS_ERR(wdgdev->dev)) {
		pr_err("%s line %d, create wdg device failure.\n",
		       __func__, __LINE__);
		goto err2;
	}

	pci_set_drvdata(pdev, wdgdev);
	return 0;
err2:
	idr_remove(&wdg_minors, wdgdev->minor);
err1:
	if (wdgdev) {
		kfree(wdgdev);
	}
err0:
	return -1;
}

static void wdg_pci_remove(struct pci_dev *pdev)
{
	struct wdg_pci_state *wdgdev;

	pr_info("%s line %d, wdg pci device & driver removing.\n", __func__, __LINE__);

	wdgdev = pci_get_drvdata(pdev);
	pci_set_drvdata(pdev, NULL);
	pci_dev_put(pdev);
	wdgdev->pdev = NULL;
	device_destroy(wdg_class, MKDEV(devno, wdgdev->minor));
	idr_remove(&wdg_minors, wdgdev->minor);
	kfree(wdgdev);

	return;
}

/* PCI driver binding the IDs in czl_pci_table to the probe/remove above. */
static struct pci_driver czl_wdg_driver = {
	.name           = "czl-mdev-wdg",
	.id_table       = czl_pci_table,
	.probe          = wdg_pci_probe,
	.remove         = wdg_pci_remove,
};
/*
 * Module init: create the device class, register the char device major
 * and register the PCI driver.
 *
 * class_create() reports failure with an ERR_PTR value, never NULL, so the
 * result is checked with IS_ERR(). If PCI driver registration fails, the
 * major and the class are released again.
 *
 * Returns 0 on success or a negative errno.
 */
static int czl_wdg_init(void)
{
	int ret;

	wdg_class = class_create(THIS_MODULE, "czl-wdg");
	if (IS_ERR(wdg_class)) {
		pr_err("%s line %d, create watchdog class failure.\n",
		       __func__, __LINE__);
		return PTR_ERR(wdg_class);
	}

	wdg_class->devnode = wdg_devnode;

	devno = register_chrdev(0, "czl-wdg", &czl_wdg_fops);
	if (devno < 0) {
		pr_err("%s line %d, register wdg device chrno failure.\n",
		       __func__, __LINE__);
		ret = devno;
		goto err_class;
	}

	ret = pci_register_driver(&czl_wdg_driver);
	if (ret) {
		pr_err("%s line %d, register wdg pci driver failure.\n",
		       __func__, __LINE__);
		goto err_chrdev;
	}

	return 0;

err_chrdev:
	unregister_chrdev(devno, "czl-wdg");
err_class:
	class_destroy(wdg_class);
	return ret;
}

/*
 * Module exit: unregister the PCI driver first, then release the char
 * device major, the class and the minor IDR.
 */
static void czl_wdg_exit(void)
{
	pci_unregister_driver(&czl_wdg_driver);
	unregister_chrdev(devno, "czl-wdg");
	class_destroy(wdg_class);
	idr_destroy(&wdg_minors);
	return;
}

/* Module entry/exit points of the guest PCI watchdog driver. */
module_init(czl_wdg_init)
module_exit(czl_wdg_exit)
MODULE_LICENSE("GPL v2");

qemu virtual machine user space wdt test case

#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdarg.h>

/*
 * Hex-dump @len bytes of @buf to stdout, 16 bytes per row, each row
 * prefixed with its starting offset. A trailing newline is always printed.
 */
void dump_buf(unsigned char *buf, int len)
{
	const unsigned char *cur = buf;
	int off = 0;

	while (off < len) {
		/* start a new row every 16 bytes */
		if (off % 16 == 0)
			printf("\n0x%04x: ", off);
		printf("0x%02x ", *cur);
		cur++;
		off++;
	}

	printf("\n");
}

/*
 * Test loop for the guest watchdog char device: once per second, read the
 * 32-byte BAR0 window, dump what was read, then fill it with 0x5a.
 *
 * Changes over the previous version: the write path no longer prints
 * "read" messages, the descriptor is closed on every error exit, and only
 * the bytes actually read are dumped.
 *
 * Returns -1 on any open/read/write failure; otherwise it loops forever.
 */
int main(void)
{
	int wdgfd;
	int status;
	unsigned char buf[32];

	wdgfd = open("/dev/czl-wdg-0", O_RDWR);
	if (wdgfd < 0) {
		printf("%s line %d, open failure.\n",
		       __func__, __LINE__);
		return -1;
	}

	while (1) {
		memset(buf, 0x00, sizeof(buf));

		status = read(wdgfd, buf, sizeof(buf));
		if (status < 0) {
			printf("%s line %d, read failure.\n",
			       __func__, __LINE__);
			close(wdgfd);
			return -1;
		}

		printf("%s line %d, read %d.\n", __func__, __LINE__, status);

		dump_buf(buf, status);

		memset(buf, 0x5a, sizeof(buf));
		/*
		 * The result is deliberately not treated as fatal: the driver
		 * above defines no .llseek handler, and its read/write do not
		 * advance the file position.
		 */
		lseek(wdgfd, 0, SEEK_SET);
		status = write(wdgfd, buf, sizeof(buf));
		if (status < 0) {
			printf("%s line %d, write failure.\n",
			       __func__, __LINE__);
			close(wdgfd);
			return -1;
		}
		printf("%s line %d, write %d.\n", __func__, __LINE__, status);

		sleep(1);
	}

	close(wdgfd);
	return 0;
}

测试程序构成:

测试过程:

1.安装WDG MDEV驱动:

sudo insmod mdev.ko
sudo insmod vfio_mdev.ko
sudo insmod czl-mdev-wdg.ko

测试主机默认内核的VFIO驱动是BUILT IN到KERNEL中的,如果重新编译内核,并且相关驱动编译为模块,可以看到MDEV实际上是基于VFIO驱动的:

host端的模块依赖关系可以用下图表示:

2.创建mdev设备

创建两个mdev设备

echo "f422fd86-35c0-11ef-8e50-9342c1138a56" > /sys/devices/virtual/czl_wdg/czl_wdg/mdev_supported_types/czl_wdg-1/create
echo "c04de378-35d8-11ef-95c3-339660dfc874" > /sys/devices/virtual/czl_wdg/czl_wdg/mdev_supported_types/czl_wdg-2/create

3.将第二步创建的mdev设备透传给QEMU虚拟机启动:

qemu-system-x86_64 -m 4096 -smp 4 --enable-kvm -drive file=/home/zlcao/Workspace/iso/ps.img -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/f422fd86-35c0-11ef-8e50-9342c1138a56 -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/c04de378-35d8-11ef-95c3-339660dfc874

系统启动后,可以看到虚拟机环境下出现了透传的MDEV PCI设备,设备vendor/device id为0xbeef1001,符合代码设定。

4.虚拟机内安装WDG PCI设备驱动:

上图中可以看到,两个透传的MDEV设备已经和一个名为"serial"的PCI设备驱动绑定,这并不符合预期,需要将默认的"serial"驱动和MDEV设备解绑,在QEMU虚拟机控制台中输入如下命令解绑驱动:

echo -n 0000:00:04.0 > /sys/bus/pci/drivers/serial/unbind
echo -n 0000:00:05.0 > /sys/bus/pci/drivers/serial/unbind

之后就可以安装我们的WDG PCI驱动了:

sudo insmod czl-mdev-drv.ko

安装成功后,虚拟机设备目录下出现了WDG PCI的设备节点:

此时,两个MDEV PCI设备也显示绑定到了正确的驱动:

为何创建的PCI设备绑定了"serial" pci driver?

serial_pci_driver定义在内核文件linux-5.4.260/drivers/tty/serial/8250/8250_pci.c中

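PCI设备驱动探测的时候绑定规则是检测设备的VENDOR ID、DEVICE ID以及class code是否和驱动给定的过滤器匹配,巧的是,serial_pci_driver驱动的serial_pci_tbl过滤器包含的一个映射规则的VENDOR ID和DEVICE ID都是PCI_ANY_ID,只按class code(串口类,PCI_CLASS_COMMUNICATION_SERIAL)匹配,也就是说,serial_pci_driver可以和任意声明为串口类的PCI设备绑定,而实验时vWDG配置空间声明的class code正好是串口类,所以才会出现创建的vWDG设备和serial驱动绑定的情况,虽然绑定了驱动,但是由于vWDG是我们自定义的设备,默认的serial是无法驱动的,所以必须卸载。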

并且发现一个很有意思的现象,当只有BAR0 IO空间的时候,GUEST OS启动默认使用serial_pci_driver,但是当修改程序,增加一些BAR空间后,发现GUEST启动后,两个vWDG设备就没有默认的驱动绑定了,也就不需要执行unbind的echo了。

5.运行测试用例,读写WDG PCI设备的BAR0地址空间:

此时可以看到,虚拟机中对WDG设备BAR0空间的读写调用被“透传"到了HOST机的MDEV PCI设备驱动上,可以基于对BAR0空间的回调实现我们的业务逻辑。

BAR空间映射:

经过改进的vWDG支持全部的BAR空间映射,这是通过虚拟化设备的配置空间得到的:

中断注入模拟

利用EVENTFD机制从MDEV中开始,通过KVM向GUEST 中的虚拟PCI DOG设备注入中断,下图显示GUEST OS 驱动成功接收到来自于HOST OS MDEV框架parent回调注入的中断。

step1: host os mdev vendor callback trigger interrupt,wake up irqfd_inject

step 2: irqfd worker start to run and inject interrupt to virtual ioapic:

step 3:当虚拟机投入运行时,调用ARCH的.set_irq handler将中断请求写入VMCS,触发VCPU处理中断:

所以看起来eventfd唤醒的并非是POLL,而是irqfd_inject worker.

配置空间确认

配置空间是mdev回调OPS中模拟的,control信息表示支持IO和MMIO访问,不支持BUS MASTER。状态寄存器表示设备的DEVSEL时序为medium,并且不支持capabilities.这些和MDEV中给定的虚拟设备配置空间相符。

bar空间读写测试

虚拟机内的BAR空间读写测试用例,当访问IO BAR时,虚拟机会陷入VMM,reason number为KVM_EXIT_IO,当访问MMIO BAR时,虚拟机同样也会陷入VMM,此时的reason number为 KVM_EXIT_MMIO.  QEMU处理接下来的IO/MMIO读写,以后者为例,对应QEMU中的调用堆栈现场为:

callstack:

#0  0x00005555558fb597 in vfio_region_write (opaque=0x555557bcacb8, addr=790560, data=197640, size=4) at /home/zlcao/Workspace/qemu/qemu-4.2.1/hw/vfio/common.c:183
#1  0x000055555587fd68 in memory_region_write_accessor (mr=0x555557bd3610, addr=790560, value=0x7fffdd050f38, size=4, shift=0, mask=4294967295, attrs=...)
    at /home/zlcao/Workspace/qemu/qemu-4.2.1/memory.c:483
#2  0x000055555587ff4f in access_with_adjusted_size (addr=790560, value=0x7fffdd050f38, size=4, access_size_min=1, access_size_max=8, access_fn=
    0x55555587fca8 <memory_region_write_accessor>, mr=0x555557bd3610, attrs=...) at /home/zlcao/Workspace/qemu/qemu-4.2.1/memory.c:544
#3  0x0000555555882ef9 in memory_region_dispatch_write (mr=0x555557bd3610, addr=790560, data=197640, op=MO_32, attrs=...) at /home/zlcao/Workspace/qemu/qemu-4.2.1/memory.c:1475
#4  0x0000555555821364 in flatview_write_continue (fv=0x7fffc4004b10, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4, addr1=790560, l=4, mr=0x555557bd3610)
    at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3129
#5  0x00005555558214a9 in flatview_write (fv=0x7fffc4004b10, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4) at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3169
#6  0x00005555558217f6 in address_space_write (as=0x5555567ffb60 <address_space_memory>, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4)
    at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3259
#7  0x0000555555821863 in address_space_rw (as=0x5555567ffb60 <address_space_memory>, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4, is_write=true)
    at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3269
#8  0x000055555589b6d4 in kvm_cpu_exec (cpu=0x555556d19b00) at /home/zlcao/Workspace/qemu/qemu-4.2.1/accel/kvm/kvm-all.c:2374
#9  0x0000555555870f64 in qemu_kvm_cpu_thread_fn (arg=0x555556d19b00) at /home/zlcao/Workspace/qemu/qemu-4.2.1/cpus.c:1318
#10 0x0000555555e0c0b4 in qemu_thread_start (args=0x555556b051e0) at util/qemu-thread-posix.c:519
#11 0x00007ffff365d6db in start_thread (arg=0x7fffdd054700) at pthread_create.c:463
#12 0x00007ffff338661f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

可以看到此时访问的0xfe8c1020地址正是虚拟WATCHDOGS设备的BAR1空间中的地址:

参考文档

中断处理_中断注入-CSDN博客


结束

;