每一个驱动程序(不仅仅是网卡驱动)都会使用 module_init 向内核注册一个初始化函数,当驱动被加载时,内核会调用这个函数。比如 igb 网卡驱动的代码位于 drivers/net/ethernet/intel/igb/igb_main.c:
//file: drivers/net/ethernet/intel/igb/igb_main.c static struct pci_driver igb_driver = { .name = igb_driver_name, .id_table = igb_pci_tbl, // 根据此ID匹配,来绑定驱动 .probe = igb_probe, // 重点函数,硬件的初始化函数都在这里做 .remove = igb_remove, ...... }; staticint __init igb_init_module(void){ ...... ret = pci_register_driver(&igb_driver); return ret; }
驱动的pci_register_driver调用完成后,Linux内核就知道了该驱动的相关信息,比如igb网卡驱动的igb_driver_name和igb_probe函数地址等等。当网卡设备被识别以后,内核会调用其驱动的probe方法(igb_driver的probe方法是igb_probe)。驱动probe方法执行的目的就是让设备ready,对于igb网卡,其igb_probe位于drivers/net/ethernet/intel/igb/igb_main.c下。主要执行的操作如下:
/** * igb_probe - Device Initialization Routine * @pdev: PCI device information struct * @ent: entry in igb_pci_tbl * * Returns 0 on success, negative on failure * * igb_probe initializes an adapter identified by a pci_dev structure. * The OS initialization, configuring of the adapter private structure, * and a hardware reset occur. **/ // 当insmod 驱动ko 或 有pci设备枚举上来时,通过ID进行匹配和绑定驱动,即会调用此函数,。 //此过程是由pci总线来调用来。 static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct net_device *netdev; struct igb_adapter *adapter; struct e1000_hw *hw; u16 eeprom_data = 0; s32 ret_val; static int global_quad_port_a; /* global quad port a indication */ const struct e1000_info *ei = igb_info_tbl[ent->driver_data]; int err, pci_using_dac; u8 part_str[E1000_PBANUM_LENGTH]; /* Catch broken hardware that put the wrong VF device ID in * the PCIe SR-IOV capability. */ if (pdev->is_virtfn) { WARN(1, KERN_ERR "%s (%hx:%hx) should not be a VF!\n", pci_name(pdev), pdev->vendor, pdev->device); return -EINVAL; } err = pci_enable_device_mem(pdev); if (err) return err; pci_using_dac = 0; err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (!err) { pci_using_dac = 1; } else { err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "No usable DMA configuration, aborting\n"); goto err_dma; } } err = pci_request_selected_regions(pdev, pci_select_bars(pdev, IORESOURCE_MEM), igb_driver_name); if (err) goto err_pci_reg; pci_enable_pcie_error_reporting(pdev); pci_set_master(pdev); pci_save_state(pdev); err = -ENOMEM; netdev = alloc_etherdev_mq(sizeof(struct igb_adapter), //申请网络设备,此参数包括了申请priv成员的大小 IGB_MAX_TX_QUEUES); if (!netdev) goto err_alloc_etherdev; SET_NETDEV_DEV(netdev, &pdev->dev); pci_set_drvdata(pdev, netdev); adapter = netdev_priv(netdev); adapter->netdev = netdev; adapter->pdev = pdev; hw = &adapter->hw; hw->back = adapter; adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); err = -EIO; adapter->io_addr = 
pci_iomap(pdev, 0, 0); if (!adapter->io_addr) goto err_ioremap; /* hw->hw_addr can be altered, we'll use adapter->io_addr for unmap */ hw->hw_addr = adapter->io_addr; netdev->netdev_ops = &igb_netdev_ops; // 设备ops操作函数 igb_set_ethtool_ops(netdev); // 注册ethtool操作,即ethtool_ops netdev->watchdog_timeo = 5 * HZ; strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1); netdev->mem_start = pci_resource_start(pdev, 0); netdev->mem_end = pci_resource_end(pdev, 0); /* PCI config space info */ hw->vendor_id = pdev->vendor; hw->device_id = pdev->device; hw->revision_id = pdev->revision; hw->subsystem_vendor_id = pdev->subsystem_vendor; hw->subsystem_device_id = pdev->subsystem_device; /* Copy the default MAC, PHY and NVM function pointers */ memcpy(&hw->mac.ops, ei->mac_ops, sizeof(hw->mac.ops)); memcpy(&hw->phy.ops, ei->phy_ops, sizeof(hw->phy.ops)); memcpy(&hw->nvm.ops, ei->nvm_ops, sizeof(hw->nvm.ops)); /* Initialize skew-specific constants */ err = ei->get_invariants(hw); if (err) goto err_sw_init; /* setup the private structure */ err = igb_sw_init(adapter); if (err) goto err_sw_init; igb_get_bus_info_pcie(hw); //获取设备的硬件信息,从后面处理来看,这个设备特性还挺多。 hw->phy.autoneg_wait_to_complete = false; /* Copper options */ if (hw->phy.media_type == e1000_media_type_copper) { hw->phy.mdix = AUTO_ALL_MODES; hw->phy.disable_polarity_correction = false; hw->phy.ms_type = e1000_ms_hw_default; } if (igb_check_reset_block(hw)) dev_info(&pdev->dev, "PHY reset is blocked due to SOL/IDER session.\n"); /* features is initialized to 0 in allocation, it might have bits * set by igb_sw_init so we should use an or instead of an * assignment. 
*/ netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_RXHASH | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_TX; /* copy netdev features into list of user selectable features */ netdev->hw_features |= netdev->features; netdev->hw_features |= NETIF_F_RXALL; /* set this bit last since it cannot be part of hw_features */ netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; netdev->vlan_features |= NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG; netdev->priv_flags |= IFF_SUPP_NOFCS; if (pci_using_dac) { netdev->features |= NETIF_F_HIGHDMA; netdev->vlan_features |= NETIF_F_HIGHDMA; } if (hw->mac.type >= e1000_82576) { netdev->hw_features |= NETIF_F_SCTP_CSUM; netdev->features |= NETIF_F_SCTP_CSUM; } netdev->priv_flags |= IFF_UNICAST_FLT; adapter->en_mng_pt = igb_enable_mng_pass_thru(hw); /* before reading the NVM, reset the controller to put the device in a * known good starting state */ hw->mac.ops.reset_hw(hw); /* make sure the NVM is good , i211/i210 parts can have special NVM * that doesn't contain a checksum */ switch (hw->mac.type) { case e1000_i210: case e1000_i211: if (igb_get_flash_presence_i210(hw)) { if (hw->nvm.ops.validate(hw) < 0) { dev_err(&pdev->dev, "The NVM Checksum Is Not Valid\n"); err = -EIO; goto err_eeprom; } } break; default: if (hw->nvm.ops.validate(hw) < 0) { dev_err(&pdev->dev, "The NVM Checksum Is Not Valid\n"); err = -EIO; goto err_eeprom; } break; } /* copy the MAC address out of the NVM */ if (hw->mac.ops.read_mac_addr(hw)) dev_err(&pdev->dev, "NVM Read Error\n"); memcpy(netdev->dev_addr, hw->mac.addr, netdev->addr_len); if (!is_valid_ether_addr(netdev->dev_addr)) { dev_err(&pdev->dev, "Invalid MAC Address\n"); err = -EIO; goto err_eeprom; } /* get firmware version for ethtool -i */ igb_set_fw_version(adapter); /* configure RXPBSIZE and TXPBSIZE */ if (hw->mac.type == e1000_i210) { wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT); 
wr32(E1000_TXPBS, I210_TXPBSIZE_DEFAULT); } setup_timer(&adapter->watchdog_timer, igb_watchdog, (unsigned long) adapter); setup_timer(&adapter->phy_info_timer, igb_update_phy_info, (unsigned long) adapter); //设备驱动的功能,在异常时进行恢复 INIT_WORK(&adapter->reset_task, igb_reset_task); INIT_WORK(&adapter->watchdog_task, igb_watchdog_task); /* Initialize link properties that are user-changeable */ adapter->fc_autoneg = true; hw->mac.autoneg = true; hw->phy.autoneg_advertised = 0x2f; hw->fc.requested_mode = e1000_fc_default; hw->fc.current_mode = e1000_fc_default; igb_validate_mdi_setting(hw); /* By default, support wake on port A */ if (hw->bus.func == 0) adapter->flags |= IGB_FLAG_WOL_SUPPORTED; /* Check the NVM for wake support on non-port A ports */ if (hw->mac.type >= e1000_82580) hw->nvm.ops.read(hw, NVM_INIT_CONTROL3_PORT_A + NVM_82580_LAN_FUNC_OFFSET(hw->bus.func), 1, &eeprom_data); else if (hw->bus.func == 1) hw->nvm.ops.read(hw, NVM_INIT_CONTROL3_PORT_B, 1, &eeprom_data); if (eeprom_data & IGB_EEPROM_APME) adapter->flags |= IGB_FLAG_WOL_SUPPORTED; /* now that we have the eeprom settings, apply the special cases where * the eeprom may be wrong or the board simply won't support wake on * lan on a particular port */ switch (pdev->device) { case E1000_DEV_ID_82575GB_QUAD_COPPER: adapter->flags &= ~IGB_FLAG_WOL_SUPPORTED; break; case E1000_DEV_ID_82575EB_FIBER_SERDES: case E1000_DEV_ID_82576_FIBER: case E1000_DEV_ID_82576_SERDES: /* Wake events only supported on port A for dual fiber * regardless of eeprom setting */ if (rd32(E1000_STATUS) & E1000_STATUS_FUNC_1) adapter->flags &= ~IGB_FLAG_WOL_SUPPORTED; break; case E1000_DEV_ID_82576_QUAD_COPPER: case E1000_DEV_ID_82576_QUAD_COPPER_ET2: /* if quad port adapter, disable WoL on all but port A */ if (global_quad_port_a != 0) adapter->flags &= ~IGB_FLAG_WOL_SUPPORTED; else adapter->flags |= IGB_FLAG_QUAD_PORT_A; /* Reset for multiple quad port adapters */ if (++global_quad_port_a == 4) global_quad_port_a = 0; break; default: 
/* If the device can't wake, don't set software support */ if (!device_can_wakeup(&adapter->pdev->dev)) adapter->flags &= ~IGB_FLAG_WOL_SUPPORTED; } /* initialize the wol settings based on the eeprom settings */ if (adapter->flags & IGB_FLAG_WOL_SUPPORTED) adapter->wol |= E1000_WUFC_MAG; /* Some vendors want WoL disabled by default, but still supported */ if ((hw->mac.type == e1000_i350) && (pdev->subsystem_vendor == PCI_VENDOR_ID_HP)) { adapter->flags |= IGB_FLAG_WOL_SUPPORTED; adapter->wol = 0; } device_set_wakeup_enable(&adapter->pdev->dev, adapter->flags & IGB_FLAG_WOL_SUPPORTED); /* reset the hardware with the new settings */ igb_reset(adapter); /* Init the I2C interface 还有个I2C的控制接口*/ err = igb_init_i2c(adapter); if (err) { dev_err(&pdev->dev, "failed to init i2c interface\n"); goto err_eeprom; } /* let the f/w know that the h/w is now under the control of the * driver. */ igb_get_hw_control(adapter); strcpy(netdev->name, "eth%d"); err = register_netdev(netdev); // 注册网络设备 if (err) goto err_register; /* carrier off reporting is important to ethtool even BEFORE open */ netif_carrier_off(netdev); #ifdef CONFIG_IGB_DCA if (dca_add_requester(&pdev->dev) == 0) { adapter->flags |= IGB_FLAG_DCA_ENABLED; dev_info(&pdev->dev, "DCA enabled\n"); igb_setup_dca(adapter); } #endif #ifdef CONFIG_IGB_HWMON /* Initialize the thermal sensor on i350 devices. */ if (hw->mac.type == e1000_i350 && hw->bus.func == 0) { u16 ets_word; /* Read the NVM to determine if this i350 device supports an * external thermal sensor. 
*/ hw->nvm.ops.read(hw, NVM_ETS_CFG, 1, &ets_word); if (ets_word != 0x0000 && ets_word != 0xFFFF) adapter->ets = true; else adapter->ets = false; if (igb_sysfs_init(adapter)) dev_err(&pdev->dev, "failed to allocate sysfs resources\n"); } else { adapter->ets = false; } #endif /* Check if Media Autosense is enabled */ adapter->ei = *ei; if (hw->dev_spec._82575.mas_capable) igb_init_mas(adapter); /* do hw tstamp init after resetting */ igb_ptp_init(adapter); dev_info(&pdev->dev, "Intel(R) Gigabit Ethernet Network Connection\n"); /* print bus type/speed/width info, not applicable to i354 */ if (hw->mac.type != e1000_i354) { dev_info(&pdev->dev, "%s: (PCIe:%s:%s) %pM\n", netdev->name, ((hw->bus.speed == e1000_bus_speed_2500) ? "2.5Gb/s" : (hw->bus.speed == e1000_bus_speed_5000) ? "5.0Gb/s" : "unknown"), ((hw->bus.width == e1000_bus_width_pcie_x4) ? "Width x4" : (hw->bus.width == e1000_bus_width_pcie_x2) ? "Width x2" : (hw->bus.width == e1000_bus_width_pcie_x1) ? "Width x1" : "unknown"), netdev->dev_addr); } if ((hw->mac.type >= e1000_i210 || igb_get_flash_presence_i210(hw))) { ret_val = igb_read_part_string(hw, part_str, E1000_PBANUM_LENGTH); } else { ret_val = -E1000_ERR_INVM_VALUE_NOT_FOUND; } if (ret_val) strcpy(part_str, "Unknown"); dev_info(&pdev->dev, "%s: PBA No: %s\n", netdev->name, part_str); dev_info(&pdev->dev, "Using %s interrupts. %d rx queue(s), %d tx queue(s)\n", (adapter->flags & IGB_FLAG_HAS_MSIX) ? "MSI-X" : (adapter->flags & IGB_FLAG_HAS_MSI) ? 
"MSI" : "legacy", adapter->num_rx_queues, adapter->num_tx_queues); if (hw->phy.media_type == e1000_media_type_copper) { switch (hw->mac.type) { case e1000_i350: case e1000_i210: case e1000_i211: /* Enable EEE for internal copper PHY devices */ err = igb_set_eee_i350(hw, true, true); if ((!err) && (!hw->dev_spec._82575.eee_disable)) { adapter->eee_advert = MDIO_EEE_100TX | MDIO_EEE_1000T; adapter->flags |= IGB_FLAG_EEE; } break; case e1000_i354: if ((rd32(E1000_CTRL_EXT) & E1000_CTRL_EXT_LINK_MODE_SGMII)) { err = igb_set_eee_i354(hw, true, true); if ((!err) && (!hw->dev_spec._82575.eee_disable)) { adapter->eee_advert = MDIO_EEE_100TX | MDIO_EEE_1000T; adapter->flags |= IGB_FLAG_EEE; } } break; default: break; } } pm_runtime_put_noidle(&pdev->dev); return 0; err_register: igb_release_hw_control(adapter); memset(&adapter->i2c_adap, 0, sizeof(adapter->i2c_adap)); err_eeprom: if (!igb_check_reset_block(hw)) igb_reset_phy(hw); if (hw->flash_address) iounmap(hw->flash_address); err_sw_init: kfree(adapter->shadow_vfta); igb_clear_interrupt_scheme(adapter); #ifdef CONFIG_PCI_IOV igb_disable_sriov(pdev); #endif pci_iounmap(pdev, adapter->io_addr); err_ioremap: free_netdev(netdev); err_alloc_etherdev: pci_release_selected_regions(pdev, pci_select_bars(pdev, IORESOURCE_MEM)); err_pci_reg: err_dma: pci_disable_device(pdev); return err; }
第5步中我们看到,网卡驱动实现了ethtool所需要的接口,并在这里完成了函数地址的注册。当 ethtool 发起一个系统调用之后,内核会找到对应操作的回调函数。对于igb网卡来说,其实现函数都在drivers/net/ethernet/intel/igb/igb_ethtool.c下。相信你这次能彻底理解ethtool的工作原理了吧?这个命令之所以能查看网卡收发包统计、能修改网卡自适应模式、能调整RX 队列的数量和大小,是因为ethtool命令最终调用到了网卡驱动的相应方法,而不是ethtool本身有这个超能力。
第6步注册的igb_netdev_ops中包含的是igb_open等函数,其中igb_open会在网卡被启动的时候被调用。
//file: drivers/net/ethernet/intel/igb/igb_main.c staticconststruct net_device_ops igb_netdev_ops = { .ndo_open = igb_open, .ndo_stop = igb_close, .ndo_start_xmit = igb_xmit_frame, .ndo_get_stats64 = igb_get_stats64, .ndo_set_rx_mode = igb_set_rx_mode, .ndo_set_mac_address = igb_set_mac, .ndo_change_mtu = igb_change_mtu, .ndo_do_ioctl = igb_ioctl, ......
第7步中,在igb_probe初始化过程中,还调用到了igb_alloc_q_vector。它注册了一个NAPI机制所必需的poll函数,对于igb网卡驱动来说,这个函数就是igb_poll,如下代码所示。
/*
 * igb_alloc_q_vector - excerpt; most of the body is elided ("......").
 *
 * The visible part registers igb_poll as the poll callback for the NAPI
 * context embedded in q_vector, on the adapter's net_device.
 *
 * NOTE(review): q_vector is set up in the elided part, and the function's
 * return statement is not shown here either.
 */
static int igb_alloc_q_vector(struct igb_adapter *adapter,
			      int v_count, int v_idx,
			      int txr_count, int txr_idx,
			      int rxr_count, int rxr_idx)
{
	......
	/* initialize NAPI */
	/* 64 is passed as the last argument; presumably the NAPI weight
	 * (poll budget) - confirm against the netif_napi_add() prototype
	 * of the kernel version in use.
	 */
	netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll, 64);
}