利用周末时间做了一个MDEV虚拟化PCI设备的小试验,简单记录一下:
DEMO架构,此图参考了内核文档:Documentation/driver-api/vfio-mediated-device.rst
Demo 框架:参考如下文章中的受控直通方案,区别是由于实验中的watchdog是纯粹的模拟设备,包括BAR IO空间实际上对应的都是内存BUFFER,不需要PASS-THROUGH。
host kernel watchdog pci driver:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/uuid.h>
#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/mdev.h>
#include <linux/pci.h>
/* Size in bytes of the emulated BAR0 window (backed by a kernel buffer). */
#define IO_BAR0_SIZE 32
/* Size in bytes of the emulated PCI configuration space buffer. */
#define IO_CONF_SIZE 0x100
/* Vendor/device IDs written into the emulated configuration header. */
#define CZL_WDG_DEVICE_VENDOR_ID 0xbeef
#define CZL_WDG_DEVICE_DEVICE_ID 0x1001
/* Debug print helper: prefixes the message with function name and line. */
#define API_DBG(fmt, ...) do { \
printk("%s line %d, "fmt, __func__, __LINE__, ##__VA_ARGS__); \
} while (0)
/* Module-wide state of the mdev parent device. */
struct czl_wdg_dev {
/* Char device number range obtained from alloc_chrdev_region(). */
dev_t wdg_devt;
/* Device class created with class_create(). */
struct class *wdg_class;
/* Char device registered over wdg_devt. */
struct cdev wdg_cdev;
/* Device registered with device_register() and used as the mdev parent. */
struct device dev;
};
/* Per-region bookkeeping for one VFIO PCI region of an mdev instance. */
struct mdev_region_info {
/* Base address decoded from the BAR register by mdev_read_base(). */
u64 start;
/* Not referenced by the code visible in this file. */
u64 phys_start;
/* Region size reported by wdg_get_region_info(). */
u32 size;
/* VFIO file offset of the region (region index << 40). */
u64 vfio_offset;
};
/* State of one emulated watchdog mdev instance. */
struct wdg_mdev_state {
/* Emulated PCI configuration space, IO_CONF_SIZE bytes. */
u8 *config;
/* Memory buffer backing BAR accesses, IO_BAR0_SIZE bytes. */
u8 *iobase;
/* The mediated device this state belongs to. */
struct mdev_device *mdev;
struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
/* Size mask applied when the guest sizes a BAR by writing 0xffffffff. */
u32 bar_mask[VFIO_PCI_NUM_REGIONS];
/* Link in wdg_mdev_devices_list. */
struct list_head next;
/* Copy of the last info returned for VFIO_DEVICE_GET_INFO. */
struct vfio_device_info dev_info;
/* 1-based number of the mdev type this instance was created from. */
int index;
/* Serialises region accesses and region-info updates. */
struct mutex ops_lock;
};
/* File operations of the parent char device; only the owner is set. */
static const struct file_operations czl_wdg_fops = {
.owner = THIS_MODULE,
};
/* Taken around insertions into and removals from wdg_mdev_devices_list. */
static struct mutex wdg_mdev_list_lock;
/* All live wdg_mdev_state instances, linked through their ->next member. */
static struct list_head wdg_mdev_devices_list;
/*
 * VFIO file-offset encoding: the region index lives in the bits above
 * WDG_VFIO_PCI_OFFSET_SHIFT, the offset inside the region in the bits below.
 */
#define WDG_VFIO_PCI_OFFSET_SHIFT (40)
/* Region index encoded in a file offset (argument is fully parenthesized). */
#define WDG_VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> WDG_VFIO_PCI_OFFSET_SHIFT)
/* File offset at which region 'index' starts. */
#define WDG_VFIO_PCI_INDEX_TO_OFFSET(index) \
	((u64)(index) << WDG_VFIO_PCI_OFFSET_SHIFT)
/* Mask selecting the in-region offset bits of a file offset. */
#define WDG_VFIO_PCI_OFFSET_MASK \
	(((u64)(1) << WDG_VFIO_PCI_OFFSET_SHIFT) - 1)
/* Number of instances used as the base for available_instances. */
#define MAX_WDGS (16)
/* The single parent-device instance managed by this module. */
static struct czl_wdg_dev czl_wdg;
/*
 * sysfs "czl_wdg_dev" show callback of the parent device: emits a fixed
 * description string and returns the number of bytes written.
 */
static ssize_t
czl_wdg_dev_show(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	ssize_t written;

	written = sprintf(buf, "mdev emulated pci watchdog device by caozilong.\n");

	return written;
}
static DEVICE_ATTR_RO(czl_wdg_dev);
/* Attributes of the parent device, exposed in the "czl_wdg" sysfs group. */
static struct attribute *wdg_dev_attrs[] = {
&dev_attr_czl_wdg_dev.attr,
NULL,
};
static const struct attribute_group wdg_dev_group = {
.name = "czl_wdg",
.attrs = wdg_dev_attrs,
};
/* NULL-terminated group list referenced by wdg_mdev_fops.dev_attr_groups. */
static const struct attribute_group *wdg_dev_groups[] = {
&wdg_dev_group,
NULL,
};
/*
 * sysfs "mdev_dev" show callback of an mdev instance: prints the device
 * name when 'dev' resolves to an mdev, otherwise just a newline.
 */
static ssize_t
mdev_dev_show(struct device *dev, struct device_attribute *attr,
	      char *buf)
{
	if (!mdev_from_dev(dev))
		return sprintf(buf, "\n");

	return sprintf(buf, "This is watchdog %s\n", dev_name(dev));
}
static DEVICE_ATTR_RO(mdev_dev);
/* Attributes of each mdev instance, exposed in the "caozilong" sysfs group. */
static struct attribute *mdev_dev_attrs[] = {
&dev_attr_mdev_dev.attr,
NULL,
};
static const struct attribute_group mdev_dev_group = {
.name = "caozilong",
.attrs = mdev_dev_attrs,
};
/* NULL-terminated group list referenced by wdg_mdev_fops.mdev_attr_groups. */
static const struct attribute_group *mdev_dev_groups[] = {
&mdev_dev_group,
NULL,
};
/*
 * mdev type attribute "name": maps the type kobject name
 * "<parent driver string>-<n>" (n = 1..3) to a human-readable label.
 * Returns -EINVAL when the kobject name matches none of the three types.
 */
static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
{
	static const char * const labels[3] = {
		"Soft Watchdog", "Hardware Watchdog", "Dummy Watchdog"
	};
	char type_name[128];
	int idx = 0;

	while (idx < 3) {
		snprintf(type_name, sizeof(type_name), "%s-%d",
			 dev_driver_string(dev), idx + 1);
		if (strcmp(kobj->name, type_name) == 0)
			return sprintf(buf, "%s\n", labels[idx]);
		idx++;
	}

	return -EINVAL;
}
/* mdev type attribute "device_api": reports the VFIO PCI device API string. */
static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
			       char *buf)
{
	const char *api = VFIO_DEVICE_API_PCI_STRING;

	return sprintf(buf, "%s\n", api);
}
static ssize_t
available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
{
struct wdg_mdev_state *mds;
int used = 0;
list_for_each_entry(mds, &wdg_mdev_devices_list, next) {
used ++;
}
return sprintf(buf, "%d\n", (MAX_WDGS - used));
}
/* Read-only attributes shared by every supported mdev type. */
static MDEV_TYPE_ATTR_RO(name);
static MDEV_TYPE_ATTR_RO(device_api);
static MDEV_TYPE_ATTR_RO(available_instances);
static struct attribute *mdev_types_attrs[] = {
&mdev_type_attr_name.attr,
&mdev_type_attr_device_api.attr,
&mdev_type_attr_available_instances.attr,
NULL,
};
/*
 * Three mdev types named "1", "2" and "3"; name_show() and czl_wdg_create()
 * compare the type kobject name against "<parent driver string>-<n>".
 */
static struct attribute_group mdev_type_group1 = {
.name = "1",
.attrs = mdev_types_attrs,
};
static struct attribute_group mdev_type_group2 = {
.name = "2",
.attrs = mdev_types_attrs,
};
static struct attribute_group mdev_type_group3 = {
.name = "3",
.attrs = mdev_types_attrs,
};
/* NULL-terminated list referenced by wdg_mdev_fops.supported_type_groups. */
static struct attribute_group *mdev_type_groups[] = {
&mdev_type_group1,
&mdev_type_group2,
&mdev_type_group3,
NULL,
};
/* mdev open callback: only logs the event and always succeeds. */
static int czl_wdg_open(struct mdev_device *mdev)
{
	pr_info("%s line %d, wdg device opened.\n", __func__, __LINE__);

	return 0;
}
/* mdev release callback: only logs the event. */
static void czl_wdg_close(struct mdev_device *mdev)
{
	pr_info("%s line %d, wdg device close.\n", __func__, __LINE__);
}
// fill pci config space meta data & capabilities.
int wdg_create_config_space(struct wdg_mdev_state *mstate)
{
// vendor id, device id.
*((unsigned int *)&mstate->config[0]) = CZL_WDG_DEVICE_VENDOR_ID |
(CZL_WDG_DEVICE_DEVICE_ID << 16);
*((unsigned short *)&mstate->config[4]) = 0x0001;
*((unsigned short *)&mstate->config[6]) = 0x0200;
mstate->config[0x8] = 0x10;
mstate->config[0x9] = 0x02;
mstate->config[0xa] = 0x00;
mstate->config[0xb] = 0x07;
*((unsigned int *)&mstate->config[0x10]) = 0x000001;
mstate->bar_mask[0] = ~(IO_BAR0_SIZE) + 1;
*((unsigned int *)&mstate->config[0x2c]) = 0x10011af4;
// cap ptr.
mstate->config[0x34] = 0x00;
mstate->config[0x3d] = 0x01;
mstate->config[0x40] = 0x23;
mstate->config[0x43] = 0x80;
mstate->config[0x44] = 0x23;
mstate->config[0x48] = 0x23;
mstate->config[0x4c] = 0x23;
mstate->config[0x60] = 0x50;
mstate->config[0x61] = 0x43;
mstate->config[0x62] = 0x49;
mstate->config[0x63] = 0x20;
mstate->config[0x64] = 0x53;
mstate->config[0x65] = 0x65;
mstate->config[0x66] = 0x72;
mstate->config[0x67] = 0x69;
mstate->config[0x68] = 0x61;
mstate->config[0x69] = 0x6c;
mstate->config[0x6a] = 0x2f;
mstate->config[0x6b] = 0x55;
mstate->config[0x6c] = 0x41;
mstate->config[0x6d] = 0x52;
mstate->config[0x6e] = 0x54;
return 0;
}
static int czl_wdg_create(struct kobject *kobj, struct mdev_device *mdev)
{
int i;
struct wdg_mdev_state *mstate;
char name[32];
if (!mdev)
return -EINVAL;
for (i = 0; i < 3; i++) {
snprintf(name, 32, "%s-%d", dev_driver_string(mdev_parent_dev(mdev)), i + 1);
if (!strcmp(kobj->name, name)) {
break;
}
}
if (i >= 3) {
return -EINVAL;
}
mstate = kzalloc(sizeof(struct wdg_mdev_state), GFP_KERNEL);
if (mstate == NULL)
return -ENOMEM;
// group number in mdev_type.
mstate->index = i + 1;
mstate->config = kzalloc(IO_CONF_SIZE, GFP_KERNEL);
if (mstate->config == NULL) {
pr_err("%s line %d, alloc pci config buffer failure.\n",
__func__, __LINE__);
kfree(mstate);
return -ENOMEM;
}
mstate->iobase = kzalloc(IO_BAR0_SIZE, GFP_KERNEL);
if (mstate->iobase == NULL) {
pr_err("%s line %d, alloc pci io buffer failure.\n",
__func__, __LINE__);
kfree(mstate->config);
kfree(mstate);
return -ENOMEM;
}
memset(mstate->config, 0x00, IO_CONF_SIZE);
mutex_init(&mstate->ops_lock);
mstate->mdev = mdev;
mdev_set_drvdata(mdev, mstate);
wdg_create_config_space(mstate);
mutex_lock(&wdg_mdev_list_lock);
list_add(&mstate->next, &wdg_mdev_devices_list);
mutex_unlock(&wdg_mdev_list_lock);
return 0;
}
static int czl_wdg_remove(struct mdev_device *mdev)
{
struct wdg_mdev_state *mds, *tmp_mds;
struct wdg_mdev_state *mstate = mdev_get_drvdata(mdev);
int ret = -EINVAL;
mutex_lock(&wdg_mdev_list_lock);
list_for_each_entry_safe(mds, tmp_mds, &wdg_mdev_devices_list, next) {
if (mstate == mds) {
list_del(&mstate->next);
mdev_set_drvdata(mdev, NULL);
kfree(mstate->config);
kfree(mstate->iobase);
kfree(mstate);
ret = 0;
break;
}
}
mutex_unlock(&wdg_mdev_list_lock);
return ret;
}
/*
 * Emulate a guest write to PCI configuration space.
 *
 * @mstate: instance state owning the config buffer.
 * @offset: byte offset inside configuration space.
 * @buf:    kernel buffer holding @count bytes to write.
 * @count:  access width in bytes.
 *
 * Only the interrupt line (0x3c) and BAR0 (0x10) are writable; the other
 * BARs are kept at zero and everything else is ignored.
 */
static void handle_pci_cfg_space_write(struct wdg_mdev_state *mstate, u16 offset,
				       u8 *buf, u32 count)
{
	u32 cfg_addr, bar_mask;

	switch (offset) {
	case 0x04: /* device control */
	case 0x06: /* device status */
		// do nothing
		break;
	case 0x3c:
		mstate->config[0x3c] = buf[0];
		break;
	case 0x3d:
		break;
	case 0x10: /* BAR0 */
		/*
		 * The caller's buffer is only @count bytes long, so a BAR
		 * update is accepted only as a full 32-bit write; narrower
		 * accesses would otherwise read past the end of @buf.
		 */
		if (count != sizeof(cfg_addr)) {
			pr_info("BAR0 write of %u bytes ignored\n", count);
			break;
		}
		memcpy(&cfg_addr, buf, sizeof(cfg_addr));
		pr_info("BAR0 addr 0x%x\n", cfg_addr);
		/* All-ones is the BAR sizing probe: answer with the size mask. */
		if (cfg_addr == 0xffffffff) {
			bar_mask = mstate->bar_mask[0];
			cfg_addr = (cfg_addr & bar_mask);
		}
		/* Preserve the read-only type bits of the BAR. */
		cfg_addr |= (mstate->config[offset] & 0x3ul);
		memcpy(&mstate->config[offset], &cfg_addr, sizeof(cfg_addr));
		break;
	case 0x14: /* BAR1 */
	case 0x18: /* BAR2 */
	case 0x1c: /* BAR3 */
	case 0x20: /* BAR4 */
	case 0x24: /* BAR5 */
		/* Unimplemented BARs always read back as zero. */
		memset(&mstate->config[offset], 0, sizeof(u32));
		break;
	default:
		pr_info("PCI config write @0x%x of %u bytes not handled\n",
			offset, count);
		break;
	}
}
/*
 * Emulate a guest read from PCI configuration space.
 *
 * Copies up to @count bytes starting at @offset from the emulated config
 * buffer into @buf. The copy is clamped to the IO_CONF_SIZE-byte buffer;
 * bytes that fall outside of it are returned as 0xff so that memory beyond
 * the buffer is never exposed.
 */
static void handle_pci_cfg_space_read(struct wdg_mdev_state *mstate, u16 offset,
				      u8 *buf, u32 count)
{
	u32 avail;

	memset(buf, 0xff, count);
	if (offset >= IO_CONF_SIZE)
		return;

	avail = IO_CONF_SIZE - offset;
	memcpy(buf, mstate->config + offset, count < avail ? count : avail);
}
static void mdev_read_base(struct wdg_mdev_state *mstate)
{
int index, pos;
u32 start_lo, start_hi;
u32 mem_type;
pos = PCI_BASE_ADDRESS_0;
for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) {
if (!mstate->region_info[index].size)
continue;
start_lo = (*(u32 *)(mstate->config + pos)) &
PCI_BASE_ADDRESS_MEM_MASK;
mem_type = (*(u32 *)(mstate->config + pos)) &
PCI_BASE_ADDRESS_MEM_TYPE_MASK;
switch (mem_type) {
case PCI_BASE_ADDRESS_MEM_TYPE_64:
start_hi = (*(u32 *)(mstate->config + pos + 4));
pos += 4;
break;
case PCI_BASE_ADDRESS_MEM_TYPE_32:
case PCI_BASE_ADDRESS_MEM_TYPE_1M:
default:
start_hi = 0;
break;
}
pos += 4;
mstate->region_info[index].start = ((u64)start_hi << 32) | start_lo;
}
return;
}
/*
 * Emulate a guest write to a BAR region.
 *
 * @index:  VFIO region index of the BAR (used for logging only).
 * @mstate: instance state owning the backing buffer.
 * @offset: byte offset inside the BAR.
 * @buf:    kernel buffer holding @count bytes to store.
 * @count:  access width in bytes.
 *
 * The backing buffer is IO_BAR0_SIZE bytes long; any access that does not
 * fit completely inside it is dropped instead of corrupting kernel memory.
 */
static void handle_bar_write(unsigned int index, struct wdg_mdev_state *mstate,
			     u16 offset, u8 *buf, u32 count)
{
	pr_info("%s line %d, bar %d, write offset 0x%x, count 0x%x, val 0x%x.\n",
		__func__, __LINE__, index, offset, count, *buf);

	if (offset >= IO_BAR0_SIZE || count > IO_BAR0_SIZE - offset) {
		pr_err("%s line %d, bar %d write out of range, dropped.\n",
		       __func__, __LINE__, index);
		return;
	}

	memcpy(mstate->iobase + offset, buf, count);
}
/*
 * Emulate a guest read from a BAR region.
 *
 * Copies up to @count bytes starting at @offset from the IO_BAR0_SIZE-byte
 * backing buffer into @buf. Bytes outside the buffer are returned as 0xff
 * so that adjacent kernel memory is never exposed. The value is logged
 * after the copy because @buf holds no meaningful data before it.
 */
static void handle_bar_read(unsigned int index, struct wdg_mdev_state *mstate,
			    u16 offset, u8 *buf, u32 count)
{
	memset(buf, 0xff, count);
	if (offset < IO_BAR0_SIZE) {
		u32 avail = IO_BAR0_SIZE - offset;

		memcpy(buf, mstate->iobase + offset,
		       count < avail ? count : avail);
	}

	pr_info("%s line %d, bar %d, read offset 0x%x, count 0x%x, val 0x%x.\n",
		__func__, __LINE__, index, offset, count, *buf);
}
/*
 * Dispatch one region access to the config-space or BAR emulation.
 *
 * @mdev:     mediated device being accessed.
 * @buf:      kernel buffer of @count bytes (source for writes, destination
 *            for reads).
 * @count:    access width in bytes.
 * @pos:      VFIO file offset; the region index is encoded above bit 40.
 * @is_write: true for a write, false for a read.
 *
 * Returns @count on success or a negative errno. Accesses that do not fit
 * inside the backing buffer of the addressed region are rejected with
 * -EINVAL before the offset is narrowed for the handlers.
 */
static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count,
			   loff_t pos, bool is_write)
{
	struct wdg_mdev_state *mstate;
	unsigned int index;
	loff_t offset;
	ssize_t ret;

	if (!mdev || !buf)
		return -EINVAL;

	mstate = mdev_get_drvdata(mdev);
	if (!mstate) {
		pr_err("%s line %d. get mstate failure.\n", __func__, __LINE__);
		return -EINVAL;
	}

	mutex_lock(&mstate->ops_lock);
	index = WDG_VFIO_PCI_OFFSET_TO_INDEX(pos);
	offset = pos & WDG_VFIO_PCI_OFFSET_MASK;

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (offset >= IO_CONF_SIZE || count > IO_CONF_SIZE - offset) {
			ret = -EINVAL;
			goto failed;
		}
		pr_info("%s: PCI config space %s at offset 0x%llx\n",
			__func__, is_write ? "write" : "read", offset);
		if (is_write)
			handle_pci_cfg_space_write(mstate, offset, buf, count);
		else
			handle_pci_cfg_space_read(mstate, offset, buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
		/* Every BAR index is served from the same iobase buffer. */
		if (offset >= IO_BAR0_SIZE || count > IO_BAR0_SIZE - offset) {
			ret = -EINVAL;
			goto failed;
		}
		if (!mstate->region_info[index].start)
			mdev_read_base(mstate);
		if (is_write) {
			pr_info("%s: write bar%d offset 0x%llx, val 0x%x.\n",
				__func__, index, offset, *buf);
			handle_bar_write(index, mstate, offset, buf, count);
		} else {
			handle_bar_read(index, mstate, offset, buf, count);
			/* Log after the read so that *buf is valid. */
			pr_info("%s: read bar%d offset 0x%llx, val 0x%x.\n",
				__func__, index, offset, *buf);
		}
		break;
	default:
		ret = -EINVAL;
		goto failed;
	}

	ret = count;
failed:
	mutex_unlock(&mstate->ops_lock);
	return ret;
}
/*
 * mdev read callback: split a user read into naturally aligned 4-, 2- or
 * 1-byte accesses and forward each one to mdev_access().
 *
 * Returns the number of bytes read, or -EFAULT when an access or the copy
 * to user space fails. *ppos is advanced by the bytes transferred.
 */
static ssize_t czl_wdg_read(struct mdev_device *mdev, char __user *buf,
			    size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	pr_info("%s line %d, read count 0x%lx, pos 0x%llx.\n", __func__, __LINE__, count, *ppos);

	while (count) {
		size_t filled;
		/*
		 * Zero-initialised so the emulation layer never sees (or
		 * logs) uninitialised stack contents.
		 */
		u32 val = 0;

		/* Pick the widest access allowed by length and alignment. */
		if (count >= 4 && !(*ppos % 4))
			filled = 4;
		else if (count >= 2 && !(*ppos % 2))
			filled = 2;
		else
			filled = 1;

		ret = mdev_access(mdev, (u8 *)&val, filled, *ppos, false);
		if (ret <= 0)
			goto read_err;
		if (copy_to_user(buf, &val, filled))
			goto read_err;

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	pr_info("%s line %d, read count 0x%x.\n", __func__, __LINE__, done);
	return done;

read_err:
	pr_err("%s line %d, read err happend.\n", __func__, __LINE__);
	return -EFAULT;
}
/*
 * mdev write callback: split a user write into naturally aligned 4-, 2- or
 * 1-byte accesses and forward each one to mdev_access().
 *
 * Returns the number of bytes written, or -EFAULT when the copy from user
 * space or an access fails. *ppos is advanced by the bytes transferred.
 */
static ssize_t czl_wdg_write(struct mdev_device *mdev, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	pr_info("%s line %d, write count 0x%lx, pos 0x%llx.\n", __func__, __LINE__, count, *ppos);

	while (count) {
		size_t filled;
		/*
		 * Always a full, zeroed dword: the config-space emulation may
		 * look at four bytes of the buffer even for narrower writes.
		 */
		u32 val = 0;

		/* Pick the widest access allowed by length and alignment. */
		if (count >= 4 && !(*ppos % 4))
			filled = 4;
		else if (count >= 2 && !(*ppos % 2))
			filled = 2;
		else
			filled = 1;

		if (copy_from_user(&val, buf, filled))
			goto write_err;
		ret = mdev_access(mdev, (u8 *)&val, filled, *ppos, true);
		if (ret <= 0)
			goto write_err;

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	pr_info("%s line %d, write count 0x%x.\n", __func__, __LINE__, done);
	return done;

write_err:
	pr_err("%s line %d, write failure.\n", __func__, __LINE__);
	return -EFAULT;
}
/*
 * Fill @dev_info for VFIO_DEVICE_GET_INFO: a PCI-type VFIO device exposing
 * the standard number of PCI regions and IRQ indexes. Always returns 0.
 */
static int wdg_get_device_info(struct mdev_device *mdev, struct vfio_device_info *dev_info)
{
	dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
	dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
	dev_info->flags = VFIO_DEVICE_FLAGS_PCI;

	return 0;
}
static int wdg_get_region_info(struct mdev_device *mdev, struct vfio_region_info *region_info)
{
unsigned int size = 0;
struct wdg_mdev_state *mstate;
u32 bar_index;
if (!mdev) {
pr_err("%s line %d,mdev is null.\n", __func__, __LINE__);
return -EINVAL;
}
mstate = mdev_get_drvdata(mdev);
if (!mstate) {
pr_err("%s line %d,mstat is null.\n", __func__, __LINE__);
return -EINVAL;
}
bar_index = region_info->index;
if (bar_index >= VFIO_PCI_NUM_REGIONS) {
pr_err("%s line %d,bar index %d exceeds.\n", __func__, __LINE__, bar_index);
return -EINVAL;
}
mutex_lock(&mstate->ops_lock);
switch (bar_index) {
case VFIO_PCI_CONFIG_REGION_INDEX:
size = IO_CONF_SIZE;
break;
case VFIO_PCI_BAR0_REGION_INDEX:
size = IO_BAR0_SIZE;
break;
default:
size = 0;
break;
}
mstate->region_info[bar_index].size = size;
mstate->region_info[bar_index].vfio_offset =
WDG_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
region_info->size = size;
region_info->offset = WDG_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
region_info->flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE;
mutex_unlock(&mstate->ops_lock);
return 0;
}
static int wdg_get_irq_info(struct mdev_device *mdev, struct vfio_irq_info *irq_info)
{
switch (irq_info->index) {
case VFIO_PCI_INTX_IRQ_INDEX:
case VFIO_PCI_MSI_IRQ_INDEX:
case VFIO_PCI_REQ_IRQ_INDEX:
break;
default:
pr_err("%s line %d, irq idx %d is invalid.\n",
__func__, __LINE__, irq_info->index);
return -EINVAL;
}
irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
irq_info->count = 1;
if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX)
irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE |
VFIO_IRQ_INFO_AUTOMASKED);
else
irq_info->flags |= VFIO_IRQ_INFO_NORESIZE;
return 0;
}
static long czl_wdg_ioctl(struct mdev_device *mdev, unsigned int cmd,
unsigned long arg)
{
int ret = 0;
unsigned long minsz;
struct wdg_mdev_state *mstate;
pr_info("czl wdg ioctl enter.\n");
if (!mdev) {
pr_err("%s line %d, mdev is null.\n", __func__, __LINE__);
return -EINVAL;
}
mstate = mdev_get_drvdata(mdev);
if (!mstate) {
pr_err("%s line %d, cant find mstate data.\n", __func__, __LINE__);
return -ENODEV;
}
switch (cmd) {
case VFIO_DEVICE_GET_INFO: {
struct vfio_device_info info;
minsz = offsetofend(struct vfio_device_info, num_irqs);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz) {
pr_err("%s line %d, info.argsz %d < minsz %ld.\n",
__func__, __LINE__, info.argsz, minsz);
return -EINVAL;
}
ret = wdg_get_device_info(mdev, &info);
if (ret) {
pr_err("%s line %d, get device info failure.\n", __func__, __LINE__);
return ret;
}
memcpy(&mstate->dev_info, &info, sizeof(info));
if (copy_to_user((void __user *)arg, &info, minsz))
return -EFAULT;
return 0;
}
case VFIO_DEVICE_GET_REGION_INFO: {
struct vfio_region_info info;
minsz = offsetofend(struct vfio_region_info, offset);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz) {
pr_err("%s line %d, info.argsz %d < minsz %ld.\n",
__func__, __LINE__, info.argsz, minsz);
return -EINVAL;
}
ret = wdg_get_region_info(mdev, &info);
if (ret) {
pr_err("%s line %d, get region info failure.\n", __func__, __LINE__);
return ret;
}
if (copy_to_user((void __user *)arg, &info, minsz))
return -EFAULT;
return 0;
}
case VFIO_DEVICE_GET_IRQ_INFO: {
struct vfio_irq_info info;
minsz = offsetofend(struct vfio_irq_info, count);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if ((info.argsz < minsz) ||
(info.index >= mstate->dev_info.num_irqs))
return -EINVAL;
ret = wdg_get_irq_info(mdev, &info);
if (ret)
return ret;
if (copy_to_user((void __user *)arg, &info, minsz))
return -EFAULT;
return 0;
}
case VFIO_DEVICE_SET_IRQS: {
pr_info("%s line %d, set irqs.\n", __func__, __LINE__);
return 0;
}
case VFIO_DEVICE_RESET:
pr_info("%s line %d, reset.\n", __func__, __LINE__);
return 0;
}
return -EINVAL;
}
/* Parent operations handed to mdev_register_device() in mdev_wdg_init(). */
static const struct mdev_parent_ops wdg_mdev_fops = {
.owner = THIS_MODULE,
.dev_attr_groups = wdg_dev_groups,
.mdev_attr_groups = mdev_dev_groups,
.supported_type_groups = mdev_type_groups,
.create = czl_wdg_create,
.remove = czl_wdg_remove,
.open = czl_wdg_open,
.release = czl_wdg_close,
.read = czl_wdg_read,
.write = czl_wdg_write,
.ioctl = czl_wdg_ioctl,
};
/*
 * Release callback of the parent device. The device is embedded in the
 * static czl_wdg object, so this only logs.
 */
static void wdg_device_release(struct device *dev)
{
	pr_info("czl wdg devide release.\n");
	return;
}
static int mdev_wdg_init(void)
{
int ret = 0;
pr_info("czl wdg init.\n");
memset(&czl_wdg, 0x00, sizeof(czl_wdg));
ret = alloc_chrdev_region(&czl_wdg.wdg_devt, 0, MINORMASK + 1, "czl_wdg");
if (ret < 0) {
pr_err("error: failed to register czl wdg device, err:%d\n", ret);
return -1;
}
cdev_init(&czl_wdg.wdg_cdev, &czl_wdg_fops);
cdev_add(&czl_wdg.wdg_cdev, czl_wdg.wdg_devt, MINORMASK + 1);
pr_info("major_number:%d\n", MAJOR(czl_wdg.wdg_devt));
czl_wdg.wdg_class = class_create(THIS_MODULE, "czl_wdg");
if (IS_ERR(czl_wdg.wdg_class)) {
pr_err("error: failed to create wdg class.\n");
ret = -1;
goto failed1;
}
czl_wdg.dev.class = czl_wdg.wdg_class;
czl_wdg.dev.release = wdg_device_release;
dev_set_name(&czl_wdg.dev, "%s", "czl_wdg");
ret = device_register(&czl_wdg.dev);
if (ret) {
pr_err("%s line %d, register wdg device failure.\n", __func__, __LINE__);
ret = -1;
goto failed2;
}
ret = mdev_register_device(&czl_wdg.dev, &wdg_mdev_fops);
if (ret) {
pr_err("%s line %d, register wdg mdev device failure.\n", __func__, __LINE__);
ret = -1;
goto failed3;
}
mutex_init(&wdg_mdev_list_lock);
INIT_LIST_HEAD(&wdg_mdev_devices_list);
pr_info("czl wdg init success.\n");
goto done;
failed3:
device_unregister(&czl_wdg.dev);
failed2:
class_destroy(czl_wdg.wdg_class);
failed1:
cdev_del(&czl_wdg.wdg_cdev);
unregister_chrdev_region(czl_wdg.wdg_devt, MINORMASK + 1);
done:
return ret;
}
static void mdev_wdg_exit(void)
{
czl_wdg.dev.bus = NULL;
mdev_unregister_device(&czl_wdg.dev);
device_unregister(&czl_wdg.dev);
cdev_del(&czl_wdg.wdg_cdev);
unregister_chrdev_region(czl_wdg.wdg_devt, MINORMASK + 1);
class_destroy(czl_wdg.wdg_class);
czl_wdg.wdg_class = NULL;
pr_info("czl_wdg_unload.\n");
return;
}
module_init(mdev_wdg_init)
module_exit(mdev_wdg_exit)
MODULE_LICENSE("GPL v2");
qemu virtual machine kernel watchdog pci driver
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/cdev.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/uuid.h>
#include <linux/vfio.h>
#include <linux/iommu.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/file.h>
#include <linux/mdev.h>
#include <linux/pci.h>
#include <linux/idr.h>
/* Char device major number returned by register_chrdev(). */
static int devno;
/* Maps minor numbers to struct wdg_pci_state instances. */
static DEFINE_IDR(wdg_minors);
/* Held around idr_alloc()/idr_find() on wdg_minors. */
static DEFINE_MUTEX(wdg_minors_lock);
/* Exclusive upper bound passed to idr_alloc() for minor numbers. */
#define WDG_MINORS_COUNT 256
/* Per-PCI-device driver state. */
struct wdg_pci_state {
/* Referenced PCI device (taken with pci_dev_get()). */
struct pci_dev *pdev;
/* Device created with device_create() for the /dev node. */
struct device *dev;
/* Start of PCI resource 0, used as the port base for inb()/outb(). */
int iobase;
/* Length of PCI resource 0 in bytes. */
int iolen;
int major;
int minor;
};
/* Class under which the char device nodes are created. */
static struct class *wdg_class;
/* PCI IDs this driver binds to. */
static const struct pci_device_id czl_pci_table[] = {
{ PCI_DEVICE(0xbeef, 0x1001), },
{ 0, }
};
static int czl_wdg_open(struct inode *inode, struct file *file)
{
int rc = 0;
int major, minor;
major = imajor(inode);
minor = iminor(inode);
mutex_lock(&wdg_minors_lock);
file->private_data = idr_find(&wdg_minors, minor);
mutex_unlock(&wdg_minors_lock);
if (!file->private_data) {
pr_err("%s line %d, cant find wdg structure.\n",
__func__, __LINE__);
rc = -1;
}
return rc;
}
/* Char device release: nothing to undo, always succeeds. */
static int czl_wdg_release(struct inode *inode, struct file *file)
{
	int rc = 0;

	return rc;
}
/*
 * Char device read: read bytes from the device's I/O port window.
 *
 * Reads up to @size bytes with inb(), starting at port iobase + *ppos and
 * stopping at the end of the window (iolen). The request is clamped to the
 * window before allocating, so user space cannot force an arbitrarily large
 * kernel allocation. The file position is intentionally left untouched,
 * preserving the original semantics.
 *
 * Returns the number of bytes read (possibly 0), -ENODEV without device
 * state, -EINVAL for a position outside the window, -ENOMEM on allocation
 * failure or -EFAULT when the copy to user space fails.
 */
ssize_t czl_wdg_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
	struct wdg_pci_state *wdgdev = file->private_data;
	unsigned char *kbuf;
	size_t avail, len, i;
	ssize_t ret;

	if (!wdgdev) {
		pr_err("%s line %d, read failure.\n", __func__, __LINE__);
		return -ENODEV;
	}
	if (*ppos < 0 || *ppos > wdgdev->iolen) {
		pr_err("%s line %d, read pos %lld exceed max io len %d.\n",
		       __func__, __LINE__, *ppos, wdgdev->iolen);
		return -EINVAL;
	}

	avail = wdgdev->iolen - *ppos;
	len = size < avail ? size : avail;
	if (!len)
		return 0;

	/* kzalloc() takes the size first and the GFP flags second. */
	kbuf = kzalloc(len, GFP_KERNEL);
	if (!kbuf) {
		pr_err("%s line %d, alloc kbuf failure.\n",
		       __func__, __LINE__);
		return -ENOMEM;
	}

	for (i = 0; i < len; i++)
		kbuf[i] = inb(wdgdev->iobase + *ppos + i);

	ret = copy_to_user(buf, kbuf, len) ? -EFAULT : (ssize_t)len;
	kfree(kbuf);
	return ret;
}
/*
 * Char device write: write bytes to the device's I/O port window.
 *
 * Writes up to @count bytes with outb(), starting at port iobase + *ppos
 * and stopping at the end of the window (iolen). The request is clamped to
 * the window before allocating. The file position is intentionally left
 * untouched, preserving the original semantics.
 *
 * Returns the number of bytes written (possibly 0), -ENODEV without device
 * state, -EINVAL for a position outside the window, -ENOMEM on allocation
 * failure or -EFAULT when the copy from user space fails.
 */
static ssize_t czl_wdg_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct wdg_pci_state *wdgdev = file->private_data;
	unsigned char *kbuf;
	size_t avail, len, i;

	if (!wdgdev) {
		pr_err("%s line %d, write failure.\n", __func__, __LINE__);
		return -ENODEV;
	}
	if (*ppos < 0 || *ppos > wdgdev->iolen) {
		pr_err("%s line %d, write pos %lld exceed max io len %d.\n",
		       __func__, __LINE__, *ppos, wdgdev->iolen);
		return -EINVAL;
	}

	avail = wdgdev->iolen - *ppos;
	len = count < avail ? count : avail;
	if (!len)
		return 0;

	/* kzalloc() takes the size first and the GFP flags second. */
	kbuf = kzalloc(len, GFP_KERNEL);
	if (!kbuf) {
		pr_err("%s line %d, alloc kbuf failure.\n",
		       __func__, __LINE__);
		return -ENOMEM;
	}

	if (copy_from_user(kbuf, buf, len)) {
		kfree(kbuf);
		return -EFAULT;
	}

	for (i = 0; i < len; i++)
		outb((u8)kbuf[i], wdgdev->iobase + *ppos + i);

	kfree(kbuf);
	return len;
}
/* File operations of the /dev/czl-wdg-<minor> nodes. */
static const struct file_operations czl_wdg_fops = {
.owner = THIS_MODULE,
.open = czl_wdg_open,
.release = czl_wdg_release,
.read = czl_wdg_read,
.write = czl_wdg_write,
};
/*
 * Class devnode callback: name the node after the device and make it
 * readable and writable by everyone.
 *
 * The mode is 0666; the previous 06666 also set the setuid/setgid bits,
 * which have no place on a device node.
 *
 * Returns a kasprintf()-allocated node name, or NULL on allocation failure.
 */
static char *wdg_devnode(struct device *dev, umode_t *mode)
{
	if (mode)
		*mode = 0666;

	return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
}
static int wdg_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct wdg_pci_state *wdgdev = NULL;
pr_info("%s line %d, wdg pci device & driver binding.\n", __func__, __LINE__);
wdgdev = kzalloc(GFP_KERNEL, sizeof(*wdgdev));
if (!wdgdev) {
pr_err("%s line %d, fail to alloc buffer.\n",
__func__, __LINE__);
goto err0;
}
wdgdev->major = devno;
wdgdev->pdev = pci_dev_get(pdev);
wdgdev->iobase = pci_resource_start(pdev, 0);
wdgdev->iolen = pci_resource_len(pdev, 0);
mutex_lock(&wdg_minors_lock);
wdgdev->minor = idr_alloc(&wdg_minors, wdgdev, 0, WDG_MINORS_COUNT, GFP_KERNEL);
mutex_unlock(&wdg_minors_lock);
if (wdgdev->minor < 0) {
pr_err("%s line %d, get minor failure from idr.\n", __func__, __LINE__);
goto err1;
}
pr_info("%s line %d, major %d, minor %d, iobase 0x%x.\n", __func__, __LINE__,
devno, wdgdev->minor, wdgdev->iobase);
wdgdev->dev = device_create(wdg_class, NULL, MKDEV(devno, wdgdev->minor),
NULL, "czl-wdg-%d", wdgdev->minor);
if (!wdgdev->dev || IS_ERR(wdgdev->dev)) {
pr_err("%s line %d, create wdg device failure.\n",
__func__, __LINE__);
goto err2;
}
pci_set_drvdata(pdev, wdgdev);
return 0;
err2:
idr_remove(&wdg_minors, wdgdev->minor);
err1:
if (wdgdev) {
kfree(wdgdev);
}
err0:
return -1;
}
static void wdg_pci_remove(struct pci_dev *pdev)
{
struct wdg_pci_state *wdgdev;
pr_info("%s line %d, wdg pci device & driver removing.\n", __func__, __LINE__);
wdgdev = pci_get_drvdata(pdev);
pci_set_drvdata(pdev, NULL);
pci_dev_put(pdev);
wdgdev->pdev = NULL;
device_destroy(wdg_class, MKDEV(devno, wdgdev->minor));
idr_remove(&wdg_minors, wdgdev->minor);
kfree(wdgdev);
return;
}
/* PCI driver description registered in czl_wdg_init(). */
static struct pci_driver czl_wdg_driver = {
.name = "czl-mdev-wdg",
.id_table = czl_pci_table,
.probe = wdg_pci_probe,
.remove = wdg_pci_remove,
};
/*
 * Module init: create the device class, register the char device major and
 * register the PCI driver.
 *
 * class_create() reports failure through an ERR_PTR rather than NULL, so
 * the result is checked with IS_ERR(). Returns 0 on success or a negative
 * errno; earlier steps are undone when a later one fails.
 */
static int czl_wdg_init(void)
{
	int ret;

	wdg_class = class_create(THIS_MODULE, "czl-wdg");
	if (IS_ERR(wdg_class)) {
		pr_err("%s line %d, create watchdog class failure.\n",
		       __func__, __LINE__);
		return PTR_ERR(wdg_class);
	}
	wdg_class->devnode = wdg_devnode;

	devno = register_chrdev(0, "czl-wdg", &czl_wdg_fops);
	if (devno < 0) {
		pr_err("%s line %d, register wdg device chrno failure.\n",
		       __func__, __LINE__);
		ret = devno;
		goto err_class;
	}

	ret = pci_register_driver(&czl_wdg_driver);
	if (ret) {
		pr_err("%s line %d, register wdg pci driver failure.\n",
		       __func__, __LINE__);
		goto err_chrdev;
	}

	return 0;

err_chrdev:
	unregister_chrdev(devno, "czl-wdg");
err_class:
	class_destroy(wdg_class);
	return ret;
}
/*
 * Module exit: unregister the PCI driver first, then release the char
 * device major, the class and the minor-number IDR.
 */
static void czl_wdg_exit(void)
{
	pci_unregister_driver(&czl_wdg_driver);

	unregister_chrdev(devno, "czl-wdg");
	class_destroy(wdg_class);

	idr_destroy(&wdg_minors);
}
module_init(czl_wdg_init)
module_exit(czl_wdg_exit)
MODULE_LICENSE("GPL v2");
qemu virtual machine user space wdt test case
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdarg.h>
/*
 * Hex-dump len bytes of buf to stdout, 16 bytes per row, each row prefixed
 * with its starting offset.
 */
void dump_buf(unsigned char *buf, int len)
{
	int row, col;

	for (row = 0; row < len; row += 16) {
		printf("\n0x%04x: ", row);
		for (col = row; col < row + 16 && col < len; col++)
			printf("0x%02x ", buf[col]);
	}

	printf("\n");
}
/*
 * Test loop: once per second read the 32-byte window of /dev/czl-wdg-0,
 * dump it, then overwrite it with 0x5a.
 *
 * Returns -1 when the device cannot be opened or an access fails; the
 * descriptor is closed on every exit path.
 */
int main(void)
{
	int wdgfd;
	int status;
	unsigned char buf[32];

	wdgfd = open("/dev/czl-wdg-0", O_RDWR);
	if (wdgfd < 0) {
		printf("%s line %d, open failure.\n",
		       __func__, __LINE__);
		return -1;
	}

	while (1) {
		memset(buf, 0x00, sizeof(buf));
		status = read(wdgfd, buf, sizeof(buf));
		if (status < 0) {
			printf("%s line %d, read failure.\n",
			       __func__, __LINE__);
			close(wdgfd);
			return -1;
		}
		printf("%s line %d, read %d.\n", __func__, __LINE__, status);
		dump_buf(buf, sizeof(buf));

		memset(buf, 0x5a, sizeof(buf));
		lseek(wdgfd, 0, SEEK_SET);
		status = write(wdgfd, buf, sizeof(buf));
		if (status < 0) {
			printf("%s line %d, write failure.\n",
			       __func__, __LINE__);
			close(wdgfd);
			return -1;
		}
		printf("%s line %d, write %d.\n", __func__, __LINE__, status);

		sleep(1);
	}

	close(wdgfd);
	return 0;
}
测试程序构成:
测试过程:
1.安装WDG MDEV驱动:
sudo insmod mdev.ko
sudo insmod vfio_mdev.ko
sudo insmod czl-mdev-wdg.ko
测试主机默认内核的VFIO驱动是BUILT IN到KERNEL中的,如果重新编译内核,并且相关驱动编译为模块,可以看到MDEV实际上是基于VFIO驱动的:
host端的模块依赖关系可以用下图表示:
2.创建mdev设备
创建两个mdev设备
echo "f422fd86-35c0-11ef-8e50-9342c1138a56" > /sys/devices/virtual/czl_wdg/czl_wdg/mdev_supported_types/czl_wdg-1/create
echo "c04de378-35d8-11ef-95c3-339660dfc874" > /sys/devices/virtual/czl_wdg/czl_wdg/mdev_supported_types/czl_wdg-2/create
3.将第二步创建的mdev设备透传给QEMU虚拟机启动:
qemu-system-x86_64 -m 4096 -smp 4 --enable-kvm -drive file=/home/zlcao/Workspace/iso/ps.img -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/f422fd86-35c0-11ef-8e50-9342c1138a56 -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/c04de378-35d8-11ef-95c3-339660dfc874
系统启动后,可以看到虚拟机环境下出现了透传的MDEV PCI设备,设备vendor/device id为0xbeef1001,符合代码设定。
4.虚拟机内安装WDG PCI设备驱动:
上图中可以看到,两个透传的MDEV设备已经和一个名为"serial"的PCI设备驱动绑定,这并不符合预期,需要将默认的"serial"驱动和MDEV设备解绑,在QEMU虚拟机控制台中输入如下命令解绑驱动:
echo -n 0000:00:04.0 > /sys/bus/pci/drivers/serial/unbind
echo -n 0000:00:05.0 > /sys/bus/pci/drivers/serial/unbind
之后就可以安装我们的WDG PCI驱动了:
sudo insmod czl-mdev-drv.ko
安装成功后,虚拟机设备目录下出现了WDG PCI的设备节点:
此时,两个MDEV PCI设备也显示绑定到了正确的驱动:
为何创建的PCI设备绑定了"serial" pci driver?
serial_pci_driver定义在内核文件linux-5.4.260/drivers/tty/serial/8250/8250_pci.c中
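PCI设备驱动探测的时候绑定规则是检测设备的VENDOR ID、DEVICE ID以及CLASS CODE是否和驱动给定的过滤器(id_table)匹配。serial_pci_driver驱动的serial_pci_tbl过滤器末尾包含几条VENDOR ID和DEVICE ID均为PCI_ANY_ID的映射规则,不过这些规则同时通过class/class_mask限定了设备类别(例如PCI_CLASS_COMMUNICATION_SERIAL),也就是说,serial_pci_driver并不是可以和任意的PCI设备绑定,而是可以和任意厂商的"串口类"PCI设备绑定。而前面host端驱动的wdg_create_config_space()恰好把虚拟设备配置空间的class code设置成了串口控制器(config[0xb]=0x07,config[0xa]=0x00,prog-if config[0x9]=0x02),所以才会出现创建的vWDG设备和serial驱动绑定的情况。虽然绑定了驱动,但是由于vWDG是我们自定义的设备,默认的serial驱动是无法驱动它的,所以必须先解绑。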
并且发现一个很有意思的现象,当只有BAR0 IO空间的时候,GUEST OS启动默认使用serial_pci_driver,但是当修改程序,增加一些BAR空间后,发现GUEST启动后,两个vWDG设备就没有默认的驱动绑定了,也就不需要执行unbind的echo了。
5.运行测试用例,读写WDG PCI设备的BAR0地址空间:
此时可以看到,虚拟机中对WDG设备BAR0空间的读写调用被“透传"到了HOST机的MDEV PCI设备驱动上,可以基于对BAR0空间的回调实现我们的业务逻辑。
BAR空间映射:
经过改进的vWDG支持全部的BAR空间映射,这是通过虚拟化设备的配置空间得到的:
中断注入模拟
利用EVENTFD机制从MDEV中开始,通过KVM向GUEST 中的虚拟PCI DOG设备注入中断,下图显示GUEST OS 驱动成功接受到来自于HOST OS MDEV框架parent回调注入的中断。
step1: host os mdev vendor callback trigger interrupt,wake up irqfd_inject
step 2: irqfd worker starts to run and injects the interrupt into the virtual IOAPIC:
step 3:当虚拟机投入运行时,调用ARCH的.set_irq handler将中断请求写入VMCS,触发VCPU处理中断:
所以看起来eventfd唤醒的并非是POLL,而是irqfd_inject worker.
配置空间确认
配置空间是mdev回调OPS中模拟的,control信息表示支持IO和MMIO访问,不支持BUS MASTER。状态寄存器表示medium设备,并且不支持capabilities.这些和MDEV中给定的虚拟设备配置空间相符。
bar空间读写测试
虚拟机内的BAR空间读写测试用例,当访问IO BAR时,虚拟机会陷入VMM,exit reason为KVM_EXIT_IO,当访问MMIO BAR时,虚拟机同样也会陷入VMM,此时的exit reason为KVM_EXIT_MMIO。QEMU处理接下来的IO/MMIO读写,以后者为例,对应QEMU中的调用堆栈现场为:
callstack:
#0 0x00005555558fb597 in vfio_region_write (opaque=0x555557bcacb8, addr=790560, data=197640, size=4) at /home/zlcao/Workspace/qemu/qemu-4.2.1/hw/vfio/common.c:183
#1 0x000055555587fd68 in memory_region_write_accessor (mr=0x555557bd3610, addr=790560, value=0x7fffdd050f38, size=4, shift=0, mask=4294967295, attrs=...)
at /home/zlcao/Workspace/qemu/qemu-4.2.1/memory.c:483
#2 0x000055555587ff4f in access_with_adjusted_size (addr=790560, value=0x7fffdd050f38, size=4, access_size_min=1, access_size_max=8, access_fn=
0x55555587fca8 <memory_region_write_accessor>, mr=0x555557bd3610, attrs=...) at /home/zlcao/Workspace/qemu/qemu-4.2.1/memory.c:544
#3 0x0000555555882ef9 in memory_region_dispatch_write (mr=0x555557bd3610, addr=790560, data=197640, op=MO_32, attrs=...) at /home/zlcao/Workspace/qemu/qemu-4.2.1/memory.c:1475
#4 0x0000555555821364 in flatview_write_continue (fv=0x7fffc4004b10, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4, addr1=790560, l=4, mr=0x555557bd3610)
at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3129
#5 0x00005555558214a9 in flatview_write (fv=0x7fffc4004b10, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4) at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3169
#6 0x00005555558217f6 in address_space_write (as=0x5555567ffb60 <address_space_memory>, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4)
at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3259
#7 0x0000555555821863 in address_space_rw (as=0x5555567ffb60 <address_space_memory>, addr=4270592032, attrs=..., buf=0x7ffff7fec028 "\b\004\003", len=4, is_write=true)
at /home/zlcao/Workspace/qemu/qemu-4.2.1/exec.c:3269
#8 0x000055555589b6d4 in kvm_cpu_exec (cpu=0x555556d19b00) at /home/zlcao/Workspace/qemu/qemu-4.2.1/accel/kvm/kvm-all.c:2374
#9 0x0000555555870f64 in qemu_kvm_cpu_thread_fn (arg=0x555556d19b00) at /home/zlcao/Workspace/qemu/qemu-4.2.1/cpus.c:1318
#10 0x0000555555e0c0b4 in qemu_thread_start (args=0x555556b051e0) at util/qemu-thread-posix.c:519
#11 0x00007ffff365d6db in start_thread (arg=0x7fffdd054700) at pthread_create.c:463
#12 0x00007ffff338661f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
可以看到此时访问的0xfe8c1020地址正是虚拟WATCHDOGS设备的BAR1空间中的地址: