Linux native multipath provides multipathing capability in Linux. It gives the benefits of increased throughput and path failover.
Many details about it are given in https://www.kernel.org/doc/ols/2005/ols2005v1-pages-155-176.pdf
In this document we will go through the various user/kernel components of Linux native multipath.
The major user-space components are the libdevmapper shared library and the multipathd daemon. The kernel side consists of the dm-mod and dm-multipath modules. Below them, the lower-level device drivers libiscsi and scsi_transport_iscsi handle submission to the wire.
MULTIPATHD :
Let's first look at multipathd.
multipathd is installed on Linux systems as a service. Starting the service launches the daemon /etc/multipathd
Like any C program the main function of multipathd is invoked :
main --> Calls child
--> init_checkers
--> add_checker adds DIRECTIO checker
--> init_prio adds DEFAULT_PRIO
--> signal_init
--> set_oom_adj -- adjusts the OOM
Then it invokes 4 pthreads :
1. ueventloop --- Listens for the UDEV event in uevent_listen function. This adds the udev events to the queue
2. uxlsnrloop --- uxsock_listen waits for commands from users
3. checkerloop --- does check_path for all the available paths
4. uevqloop --- This takes out the events pushed into the queue.
Basically, ueventloop listens for the events and uevqloop processes them.
uevqloop processes the UDEV event and eventually calls :
uev_trigger
--> uev_add_path is called for any ADD event
--> calls ev_add_path
--> calls domap
domap function for a new device creation calls dm_addmap_create
dm_addmap(DM_DEVICE_CREATE, TGT_MPATH, mpp, params, 1, ro);
will call dm_task_create which is a libdevmapper function. This is similar to calling a dmsetup create command.
This creation will trigger an ioctl to the kernel space. The ioctl is issued on the /dev/mapper/control device.
Linux Kernel life of the device creation :
Linux Kernel has the following misc driver created for the ioctl communication:
#ifdef CONFIG_COMPAT
static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
{
return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u));
}
#else
#define dm_compat_ctl_ioctl NULL
#endif
static const struct file_operations _ctl_fops = {
.open = nonseekable_open,
.unlocked_ioctl = dm_ctl_ioctl,
.compat_ioctl = dm_compat_ctl_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
static struct miscdevice _dm_misc = {
.minor = MAPPER_CTRL_MINOR,
.name = DM_NAME,
.nodename = DM_DIR "/" DM_CONTROL_NODE,
.fops = &_ctl_fops
};
dm_ctl_ioctl calls ctl_ioctl :
--> lookup_ioctl -- fetches the function corresponding to the device creation.
{DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}
dev_create calls dm_create
--> alloc_dev
--> dm_init_md_queue(md);
--> md->disk->fops = &dm_blk_dops;
static const struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.direct_access = dm_blk_direct_access,
.getgeo = dm_blk_getgeo,
.pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
Creation of /dev/dm-X device :
Let's trace the creation of the /dev/dm-X device.
Our favourite commands for the Ftrace :)
cd /sys/kernel/debug/tracing
cat /dev/null > trace
echo dev_create > set_graph_function
echo 10 > max_graph_depth
echo function_graph > current_tracer
echo 1 > tracing_on
dmsetup create my_dev --table "0 14680064 multipath 2 queue_if_no_path retain_attached_hw_handler 1 alua 2 1 service-time 0 1 2 65:176 1 1 service-time 0 1 2 65:192 1 1"
cp trace ~/create_dev_depth10
echo 0 > tracing_on
echo > set_graph_function
echo 0 > max_graph_depth
cat /dev/null > trace
Here is the trace :
1) | dev_create [dm_mod]() {
1) | dm_create [dm_mod]() {
1) 0.035 us | dm_use_blk_mq_default [dm_mod]();
1) 0.041 us | __mutex_init();
1) 0.040 us | __mutex_init();
1) 0.037 us | __mutex_init();
1) | blk_alloc_queue_node() {
1) | blkcg_init_queue() {
1) 0.037 us | _cond_resched();
1) | blk_throtl_init() {
1) | kmem_cache_alloc_node_trace() {
1) 0.042 us | _cond_resched();
1) 0.482 us | }
1) 0.036 us | init_timer_key();
1) | blkcg_activate_policy() {
1) 0.038 us | blkcg_policy_enabled();
1) | blkg_alloc() {
1) | alloc_disk_node() {
1) | device_initialize() {
1) 0.035 us | __mutex_init();
1) | device_pm_sleep_init() {
1) 0.034 us | __init_waitqueue_head();
1) | complete_all() {
1) 0.036 us | _raw_spin_lock_irqsave();
1) 0.045 us | __wake_up_common();
1) 0.046 us | _raw_spin_unlock_irqrestore();
1) 0.980 us | }
1) 1.645 us | }
1) | pm_runtime_init() {
1) 0.035 us | init_timer_key();
1) 0.035 us | __init_waitqueue_head();
1) 0.732 us | }
1) 9.326 us | }
1) + 17.547 us | }
1) 0.035 us | __init_waitqueue_head();
1) 0.034 us | __init_waitqueue_head();
1) 0.035 us | __init_waitqueue_head();
1) | add_disk() { ---------- This will send UDEV of KOBJ_ADD for creation of /dev/dm-X device
1) 0.044 us | blk_alloc_devt();
1) | bdi_register_dev() {
1) | bdget_disk() {
1) | disk_get_part() {
1) 0.075 us | get_device();
1) 0.575 us | }
1) | bdget() {
1) | iget5_locked() {
1) 0.130 us | _raw_spin_lock();
1) 0.285 us | find_inode();
1) | alloc_inode() {
1) | dm_stats_init [dm_mod]() {
1) 0.035 us | __mutex_init();
1) | __alloc_percpu() {
1) | pcpu_alloc() {
1) | dm_sysfs_init [dm_mod]() {
1) 0.035 us | dm_disk [dm_mod]();
1) 0.036 us | dm_kobject [dm_mod]();
{DM_TABLE_LOAD_CMD, 0, table_load}
table_load function is called for initialising the device.
r = dm_setup_md_queue(md, t);
dm_setup_md_queue -- calls dm_old_init_request_queue
Here request_fn for the device mapper is completely initialized.
blk_init_allocated_queue assigns the dm_request_fn
q->request_fn = rfn;
make_request_fn is initialised to blk_queue_bio
Now the function dm_init_normal_md_queue is called. This function initialises the softirq_done_fn with dm_softirq_done.
Also prep_rq_fn is initialized to dm_old_prep_fn.
At last this function calls elv_register_queue.
But why read through the code flow when we can ftrace it? Here is the ftrace output for table_load.
1) | table_load [dm_mod]() {
1) | find_device [dm_mod]() {
1) | down_read() {
1) 0.073 us | _cond_resched();
1) 0.583 us | }
1) | __find_device_hash_cell [dm_mod]() {
1) | __get_name_cell [dm_mod]() {
1) 0.123 us | dm_get [dm_mod]();
1) 0.757 us | }
1) 1.399 us | }
1) 0.045 us | up_read();
1) 3.060 us | }
1) | dm_table_create [dm_mod]() {
1) | kmem_cache_alloc_trace() {
1) 0.040 us | _cond_resched();
1) 0.499 us | }
1) | dm_table_add_target [dm_mod]() {
1) | dm_get_target_type [dm_mod]() {
1) | get_target_type [dm_mod]() {
1) | down_read() {
1) 0.042 us | _cond_resched();
1) 0.521 us | }
1) 0.310 us | try_module_get();
1) 0.045 us | up_read();
1) 3.042 us | }
1) 3.575 us | }
1) | dm_split_args [dm_mod]() {
1) | realloc_argv [dm_mod]() {
1) | __kmalloc() {
1) 0.058 us | kmalloc_slab();
1) 0.040 us | _cond_resched();
1) 0.776 us | }
1) 0.051 us | kfree();
1) 1.684 us | }
1) 4.387 us | }
1) | multipath_ctr [dm_multipath]() {
1) 0.121 us | dm_get_reserved_rq_based_ios [dm_mod]();
1) | kmem_cache_alloc_trace() {
1) 0.041 us | _cond_resched();
1) 0.416 us | }
1) | ql_create [dm_queue_length]() {
1) | kmem_cache_alloc_trace() {
1) 0.040 us | _cond_resched();
1) 0.475 us | }
1) 0.807 us | }
1) | ql_add_path [dm_queue_length]() {
1) | kmem_cache_alloc_trace() {
1) 0.042 us | _cond_resched();
1) 0.391 us | }
1) 0.951 us | }
1) 0.041 us | dm_consume_args [dm_mod]();
1) ! 539.424 us | }
1) 0.134 us | kfree();
1) ! 549.508 us | }
0) 0.066 us | dm_get_immutable_target_type [dm_mod]();
0) 0.093 us | dm_get_md_type [dm_mod]();
0) 0.049 us | dm_table_get_type [dm_mod]();
0) 0.044 us | dm_set_md_type [dm_mod]();
0) | dm_setup_md_queue [dm_mod]() {
0) | blk_init_allocated_queue() {
0) | kmem_cache_alloc_trace() {
0) 0.042 us | _cond_resched();
0) 0.571 us | }
0) | blk_init_rl() {
0) | elevator_init() {
0) | elevator_get() {
0) 0.087 us | _raw_spin_lock();
0) 0.391 us | elevator_find();
0) 0.051 us | try_module_get();
0) 0.040 us | _raw_spin_unlock();
0) 1.862 us | }
0) | deadline_init_queue() {
0) | elevator_alloc() {
0) | kmem_cache_alloc_node_trace() {
0) 0.041 us | _cond_resched();
0) 0.648 us | }
0) 0.042 us | __mutex_init();
0) 1.568 us | }
0) | kmem_cache_alloc_node_trace() {
0) 0.041 us | _cond_resched();
0) 0.525 us | }
0) 0.044 us | _raw_spin_lock_irq();
0) 3.691 us | }
0) + 45.638 us | }
0) 0.047 us | mutex_unlock();
0) + 62.761 us | }
0) | dm_init_md_queue [dm_mod]() {
0) | blk_queue_make_request() {
0) 0.042 us | blk_queue_congestion_threshold();
0) 0.044 us | blk_queue_bounce_limit();
0) 0.721 us | }
0) 0.044 us | blk_queue_bounce_limit();
0) 0.042 us | blk_queue_merge_bvec();
0) 1.827 us | }
0) 0.043 us | blk_queue_softirq_done();
0) 0.042 us | blk_queue_prep_rq();
0) 0.042 us | blk_queue_lld_busy();
0) | elv_register_queue() {
0) | down_write() {
0) 0.040 us | _cond_resched();
0) 0.407 us | }
0) 0.040 us | dm_get_mdptr [dm_mod]();
0) 0.047 us | up_write();
0) | __dev_status [dm_mod]() {
0) 0.040 us | dm_disk [dm_mod]();
0) 0.040 us | dm_suspended_md [dm_mod]();
0) 0.040 us | dm_test_deferred_remove_flag [dm_mod]();
0) 0.042 us | dm_open_count [dm_mod]();
0) 0.047 us | dm_get_event_nr [dm_mod]();
0) | dm_get_live_table [dm_mod]() {
0) 0.134 us | __srcu_read_lock();
0) 0.478 us | }
0) | dm_put_live_table [dm_mod]() {
0) 0.069 us | __srcu_read_unlock();
0) 0.423 us | }
0) 3.391 us | }
0) 0.045 us | dm_put [dm_mod]();
0) ! 3115.589 us | } /* table_load [dm_mod] */
IO Flow:
Why are we doing all this? For the IO flow, of course.
Let's see how the IO flows through our device-mapper device. Now I am tracing dm_request_fn.
0) | dm_request_fn [dm_mod]() {
0) 0.240 us | __srcu_read_lock();
0) | blk_peek_request() {
0) | noop_dispatch() {
0) 0.150 us | elv_dispatch_sort();
0) 0.786 us | }
0) | dm_prep_fn [dm_mod]() {
0) | mempool_alloc() {
0) | mempool_alloc_slab() {
0) 0.412 us | kmem_cache_alloc();
0) 1.019 us | }
0) 1.589 us | }
0) | blk_rq_prep_clone() {
0) 0.280 us | blk_rq_init();
0) 0.160 us | dm_table_find_target [dm_mod]();
0) | multipath_busy [dm_multipath]() {
0) 0.157 us | _raw_spin_lock_irqsave();
0) | dm_underlying_device_busy [dm_mod]() {
0) | blk_lld_busy() {
0) 0.193 us | scsi_lld_busy();
0) 1.056 us | }
0) 1.905 us | }
0) 0.096 us | _raw_spin_unlock_irqrestore();
0) 5.042 us | }
0) | dm_start_request [dm_mod]() {
0) | blk_start_request() {
0) 0.160 us | blk_dequeue_request();
0) | blk_add_timer() {
0) 0.180 us | __blk_add_timer();
0) 1.032 us | }
0) 2.910 us | }
0) 0.150 us | dm_get [dm_mod]();
0) 4.795 us | }
0) 0.110 us | _raw_spin_unlock();
0) | multipath_map [dm_multipath]() {
0) | mempool_alloc() {
0) | mempool_alloc_slab() {
0) 0.193 us | kmem_cache_alloc();
0) 1.152 us | }
0) 2.015 us | }
0) | map_io [dm_multipath]() {
0) 0.107 us | _raw_spin_lock_irqsave();
0) 0.136 us | ql_start_io [dm_queue_length]();
0) 0.140 us | _raw_spin_unlock_irqrestore();
0) 3.407 us | }
0) 7.233 us | }
0) | dm_dispatch_request [dm_mod]() {
0) | blk_insert_cloned_request() {
0) 0.120 us | blk_rq_check_limits();
0) 0.103 us | _raw_spin_lock_irqsave();
0) | blk_account_io_start() {
0) 0.197 us | disk_map_sector_rcu();
0) | part_round_stats() {
0) 0.230 us | part_round_stats_single();
0) 1.072 us | }
0) 2.854 us | }
0) | __elv_add_request() {
0) | elv_drain_elevator() {
0) 0.183 us | deadline_dispatch_requests();
0) 1.003 us | }
0) | __blk_run_queue() {
0) | scsi_request_fn() {
0) 0.277 us | get_device();
0) | blk_peek_request() {
0) | scsi_prep_fn() {
0) | scsi_get_command() {
0) 0.100 us | get_device();
0) | __scsi_get_command() {
0) | scsi_host_alloc_command() {
0) | scsi_pool_alloc_command() {
0) 0.399 us | kmem_cache_alloc();
0) | kmem_cache_alloc() {
0) 0.922 us | __slab_alloc();
0) 1.871 us | }
0) 3.830 us | }
0) 4.778 us | }
0) 5.584 us | }
0) 0.163 us | init_timer_key();
0) 0.107 us | _raw_spin_lock_irqsave();
0) 0.110 us | _raw_spin_unlock_irqrestore();
0) 9.678 us | }
0) | sd_init_command [sd_mod]() {
0) | scsi_setup_fs_cmnd() {
0) 0.124 us | alua_prep_fn();
0) | scsi_init_io() {
0) | scsi_init_sgtable() {
0) | scsi_alloc_sgtable() {
0) | scsi_sg_alloc() {
0) | mempool_alloc() {
0) | mempool_alloc_slab() {
0) 0.300 us | kmem_cache_alloc();
0) 1.112 us | }
0) 2.161 us | }
0) 2.964 us | }
0) 4.269 us | }
0) | blk_start_request() {
0) 0.210 us | blk_dequeue_request();
0) | blk_add_timer() {
0) | __blk_add_timer() {
0) 0.113 us | round_jiffies_up();
0) 1.482 us | }
0) 2.410 us | }
0) 4.226 us | }
0) 0.093 us | _raw_spin_unlock();
0) 0.087 us | _raw_spin_lock();
0) 0.403 us | scsi_init_cmd_errh();
0) | scsi_dispatch_cmd() {
0) 0.230 us | scsi_log_send();
0) | iscsi_queuecommand [libiscsi]() {
0) | _raw_spin_lock_bh() {
0) 0.074 us | local_bh_disable();
0) 0.702 us | }
0) | iscsi_session_chkready [scsi_transport_iscsi]() {
0) 0.086 us | _raw_spin_lock_irqsave();
0) 0.187 us | _raw_spin_unlock_irqrestore();
0) 1.562 us | }
0) 0.123 us | queue_work_on();
0) | _raw_spin_unlock_bh() {
0) 0.246 us | local_bh_enable_ip();
0) 0.839 us | }
0) + 17.190 us | }
0) + 19.251 us | }
0) 0.077 us | _raw_spin_lock_irq();
0) | blk_peek_request() {
0) 0.179 us | deadline_dispatch_requests();
0) 0.873 us | }
0) 0.130 us | put_device();
0) 0.080 us | _raw_spin_lock_irq();
0) + 87.287 us | }
0) + 87.941 us | }
0) + 90.672 us | }
0) 0.096 us | _raw_spin_unlock_irqrestore();
0) + 97.425 us | }
0) + 98.507 us | }
0) 0.077 us | _raw_spin_lock();
0) | blk_peek_request() {
0) 0.259 us | noop_dispatch();
0) 1.063 us | }
0) | blk_delay_queue() {
0) 0.080 us | msecs_to_jiffies();
0) 0.163 us | queue_delayed_work_on();
0) 1.305 us | }
0) 0.169 us | __srcu_read_unlock();
0) ! 290.409 us | }
Many details about it are given in https://www.kernel.org/doc/ols/2005/ols2005v1-pages-155-176.pdf
In this document we will go through the various user/kernel components of Linux native multipath.
The major user-space components are the libdevmapper shared library and the multipathd daemon. The kernel side consists of the dm-mod and dm-multipath modules. Below them, the lower-level device drivers libiscsi and scsi_transport_iscsi handle submission to the wire.
MULTIPATHD :
Let's first look at multipathd.
multipathd is installed on Linux systems as a service. Starting the service launches the daemon /etc/multipathd
Like any C program the main function of multipathd is invoked :
main --> Calls child
--> init_checkers
--> add_checker adds DIRECTIO checker
--> init_prio adds DEFAULT_PRIO
--> signal_init
--> set_oom_adj -- adjusts the OOM
Then it invokes 4 pthreads :
1. ueventloop --- Listens for the UDEV event in uevent_listen function. This adds the udev events to the queue
2. uxlsnrloop --- uxsock_listen waits for commands from users
3. checkerloop --- does check_path for all the available paths
4. uevqloop --- This takes out the events pushed into the queue.
Basically, ueventloop listens for the events and uevqloop processes them.
uevqloop processes the UDEV event and eventually calls :
uev_trigger
--> uev_add_path is called for any ADD event
--> calls ev_add_path
--> calls domap
domap function for a new device creation calls dm_addmap_create
dm_addmap(DM_DEVICE_CREATE, TGT_MPATH, mpp, params, 1, ro);
will call dm_task_create which is a libdevmapper function. This is similar to calling a dmsetup create command.
This creation will trigger an ioctl to the kernel space. The ioctl is issued on the /dev/mapper/control device.
Linux Kernel life of the device creation :
Linux Kernel has the following misc driver created for the ioctl communication:
#ifdef CONFIG_COMPAT
static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
{
return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u));
}
#else
#define dm_compat_ctl_ioctl NULL
#endif
static const struct file_operations _ctl_fops = {
.open = nonseekable_open,
.unlocked_ioctl = dm_ctl_ioctl,
.compat_ioctl = dm_compat_ctl_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
static struct miscdevice _dm_misc = {
.minor = MAPPER_CTRL_MINOR,
.name = DM_NAME,
.nodename = DM_DIR "/" DM_CONTROL_NODE,
.fops = &_ctl_fops
};
dm_ctl_ioctl calls ctl_ioctl :
--> lookup_ioctl -- fetches the function corresponding to the device creation.
{DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}
dev_create calls dm_create
--> alloc_dev
--> dm_init_md_queue(md);
--> md->disk->fops = &dm_blk_dops;
static const struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.direct_access = dm_blk_direct_access,
.getgeo = dm_blk_getgeo,
.pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
Creation of /dev/dm-X device :
Let's trace the creation of the /dev/dm-X device.
Our favourite commands for the Ftrace :)
cd /sys/kernel/debug/tracing
cat /dev/null > trace
echo dev_create > set_graph_function
echo 10 > max_graph_depth
echo function_graph > current_tracer
echo 1 > tracing_on
dmsetup create my_dev --table "0 14680064 multipath 2 queue_if_no_path retain_attached_hw_handler 1 alua 2 1 service-time 0 1 2 65:176 1 1 service-time 0 1 2 65:192 1 1"
cp trace ~/create_dev_depth10
echo 0 > tracing_on
echo > set_graph_function
echo 0 > max_graph_depth
cat /dev/null > trace
Here is the trace :
1) | dev_create [dm_mod]() {
1) | dm_create [dm_mod]() {
1) 0.035 us | dm_use_blk_mq_default [dm_mod]();
1) 0.041 us | __mutex_init();
1) 0.040 us | __mutex_init();
1) 0.037 us | __mutex_init();
1) | blk_alloc_queue_node() {
1) | blkcg_init_queue() {
1) 0.037 us | _cond_resched();
1) | blk_throtl_init() {
1) | kmem_cache_alloc_node_trace() {
1) 0.042 us | _cond_resched();
1) 0.482 us | }
1) 0.036 us | init_timer_key();
1) | blkcg_activate_policy() {
1) 0.038 us | blkcg_policy_enabled();
1) | blkg_alloc() {
1) | alloc_disk_node() {
1) | device_initialize() {
1) 0.035 us | __mutex_init();
1) | device_pm_sleep_init() {
1) 0.034 us | __init_waitqueue_head();
1) | complete_all() {
1) 0.036 us | _raw_spin_lock_irqsave();
1) 0.045 us | __wake_up_common();
1) 0.046 us | _raw_spin_unlock_irqrestore();
1) 0.980 us | }
1) 1.645 us | }
1) | pm_runtime_init() {
1) 0.035 us | init_timer_key();
1) 0.035 us | __init_waitqueue_head();
1) 0.732 us | }
1) 9.326 us | }
1) + 17.547 us | }
1) 0.035 us | __init_waitqueue_head();
1) 0.034 us | __init_waitqueue_head();
1) 0.035 us | __init_waitqueue_head();
1) | add_disk() { ---------- This will send UDEV of KOBJ_ADD for creation of /dev/dm-X device
1) 0.044 us | blk_alloc_devt();
1) | bdi_register_dev() {
1) | bdget_disk() {
1) | disk_get_part() {
1) 0.075 us | get_device();
1) 0.575 us | }
1) | bdget() {
1) | iget5_locked() {
1) 0.130 us | _raw_spin_lock();
1) 0.285 us | find_inode();
1) | alloc_inode() {
1) | dm_stats_init [dm_mod]() {
1) 0.035 us | __mutex_init();
1) | __alloc_percpu() {
1) | pcpu_alloc() {
1) | dm_sysfs_init [dm_mod]() {
1) 0.035 us | dm_disk [dm_mod]();
1) 0.036 us | dm_kobject [dm_mod]();
{DM_TABLE_LOAD_CMD, 0, table_load}
table_load function is called for initialising the device.
r = dm_setup_md_queue(md, t);
dm_setup_md_queue -- calls dm_old_init_request_queue
Here request_fn for the device mapper is completely initialized.
blk_init_allocated_queue assigns the dm_request_fn
q->request_fn = rfn;
make_request_fn is initialised to blk_queue_bio
Now the function dm_init_normal_md_queue is called. This function initialises the softirq_done_fn with dm_softirq_done.
Also prep_rq_fn is initialized to dm_old_prep_fn.
At last this function calls elv_register_queue.
But why read through the code flow when we can ftrace it? Here is the ftrace output for table_load.
1) | table_load [dm_mod]() {
1) | find_device [dm_mod]() {
1) | down_read() {
1) 0.073 us | _cond_resched();
1) 0.583 us | }
1) | __find_device_hash_cell [dm_mod]() {
1) | __get_name_cell [dm_mod]() {
1) 0.123 us | dm_get [dm_mod]();
1) 0.757 us | }
1) 1.399 us | }
1) 0.045 us | up_read();
1) 3.060 us | }
1) | dm_table_create [dm_mod]() {
1) | kmem_cache_alloc_trace() {
1) 0.040 us | _cond_resched();
1) 0.499 us | }
1) | dm_table_add_target [dm_mod]() {
1) | dm_get_target_type [dm_mod]() {
1) | get_target_type [dm_mod]() {
1) | down_read() {
1) 0.042 us | _cond_resched();
1) 0.521 us | }
1) 0.310 us | try_module_get();
1) 0.045 us | up_read();
1) 3.042 us | }
1) 3.575 us | }
1) | dm_split_args [dm_mod]() {
1) | realloc_argv [dm_mod]() {
1) | __kmalloc() {
1) 0.058 us | kmalloc_slab();
1) 0.040 us | _cond_resched();
1) 0.776 us | }
1) 0.051 us | kfree();
1) 1.684 us | }
1) 4.387 us | }
1) | multipath_ctr [dm_multipath]() {
1) 0.121 us | dm_get_reserved_rq_based_ios [dm_mod]();
1) | kmem_cache_alloc_trace() {
1) 0.041 us | _cond_resched();
1) 0.416 us | }
1) | ql_create [dm_queue_length]() {
1) | kmem_cache_alloc_trace() {
1) 0.040 us | _cond_resched();
1) 0.475 us | }
1) 0.807 us | }
1) | ql_add_path [dm_queue_length]() {
1) | kmem_cache_alloc_trace() {
1) 0.042 us | _cond_resched();
1) 0.391 us | }
1) 0.951 us | }
1) 0.041 us | dm_consume_args [dm_mod]();
1) ! 539.424 us | }
1) 0.134 us | kfree();
1) ! 549.508 us | }
0) 0.066 us | dm_get_immutable_target_type [dm_mod]();
0) 0.093 us | dm_get_md_type [dm_mod]();
0) 0.049 us | dm_table_get_type [dm_mod]();
0) 0.044 us | dm_set_md_type [dm_mod]();
0) | dm_setup_md_queue [dm_mod]() {
0) | blk_init_allocated_queue() {
0) | kmem_cache_alloc_trace() {
0) 0.042 us | _cond_resched();
0) 0.571 us | }
0) | blk_init_rl() {
0) | elevator_init() {
0) | elevator_get() {
0) 0.087 us | _raw_spin_lock();
0) 0.391 us | elevator_find();
0) 0.051 us | try_module_get();
0) 0.040 us | _raw_spin_unlock();
0) 1.862 us | }
0) | deadline_init_queue() {
0) | elevator_alloc() {
0) | kmem_cache_alloc_node_trace() {
0) 0.041 us | _cond_resched();
0) 0.648 us | }
0) 0.042 us | __mutex_init();
0) 1.568 us | }
0) | kmem_cache_alloc_node_trace() {
0) 0.041 us | _cond_resched();
0) 0.525 us | }
0) 0.044 us | _raw_spin_lock_irq();
0) 3.691 us | }
0) + 45.638 us | }
0) 0.047 us | mutex_unlock();
0) + 62.761 us | }
0) | dm_init_md_queue [dm_mod]() {
0) | blk_queue_make_request() {
0) 0.042 us | blk_queue_congestion_threshold();
0) 0.044 us | blk_queue_bounce_limit();
0) 0.721 us | }
0) 0.044 us | blk_queue_bounce_limit();
0) 0.042 us | blk_queue_merge_bvec();
0) 1.827 us | }
0) 0.043 us | blk_queue_softirq_done();
0) 0.042 us | blk_queue_prep_rq();
0) 0.042 us | blk_queue_lld_busy();
0) | elv_register_queue() {
0) | down_write() {
0) 0.040 us | _cond_resched();
0) 0.407 us | }
0) 0.040 us | dm_get_mdptr [dm_mod]();
0) 0.047 us | up_write();
0) | __dev_status [dm_mod]() {
0) 0.040 us | dm_disk [dm_mod]();
0) 0.040 us | dm_suspended_md [dm_mod]();
0) 0.040 us | dm_test_deferred_remove_flag [dm_mod]();
0) 0.042 us | dm_open_count [dm_mod]();
0) 0.047 us | dm_get_event_nr [dm_mod]();
0) | dm_get_live_table [dm_mod]() {
0) 0.134 us | __srcu_read_lock();
0) 0.478 us | }
0) | dm_put_live_table [dm_mod]() {
0) 0.069 us | __srcu_read_unlock();
0) 0.423 us | }
0) 3.391 us | }
0) 0.045 us | dm_put [dm_mod]();
0) ! 3115.589 us | } /* table_load [dm_mod] */
IO Flow:
Why are we doing all this? For the IO flow, of course.
Let's see how the IO flows through our device-mapper device. Now I am tracing dm_request_fn.
0) | dm_request_fn [dm_mod]() {
0) 0.240 us | __srcu_read_lock();
0) | blk_peek_request() {
0) | noop_dispatch() {
0) 0.150 us | elv_dispatch_sort();
0) 0.786 us | }
0) | dm_prep_fn [dm_mod]() {
0) | mempool_alloc() {
0) | mempool_alloc_slab() {
0) 0.412 us | kmem_cache_alloc();
0) 1.019 us | }
0) 1.589 us | }
0) | blk_rq_prep_clone() {
0) 0.280 us | blk_rq_init();
0) 0.160 us | dm_table_find_target [dm_mod]();
0) | multipath_busy [dm_multipath]() {
0) 0.157 us | _raw_spin_lock_irqsave();
0) | dm_underlying_device_busy [dm_mod]() {
0) | blk_lld_busy() {
0) 0.193 us | scsi_lld_busy();
0) 1.056 us | }
0) 1.905 us | }
0) 0.096 us | _raw_spin_unlock_irqrestore();
0) 5.042 us | }
0) | dm_start_request [dm_mod]() {
0) | blk_start_request() {
0) 0.160 us | blk_dequeue_request();
0) | blk_add_timer() {
0) 0.180 us | __blk_add_timer();
0) 1.032 us | }
0) 2.910 us | }
0) 0.150 us | dm_get [dm_mod]();
0) 4.795 us | }
0) 0.110 us | _raw_spin_unlock();
0) | multipath_map [dm_multipath]() {
0) | mempool_alloc() {
0) | mempool_alloc_slab() {
0) 0.193 us | kmem_cache_alloc();
0) 1.152 us | }
0) 2.015 us | }
0) | map_io [dm_multipath]() {
0) 0.107 us | _raw_spin_lock_irqsave();
0) 0.136 us | ql_start_io [dm_queue_length]();
0) 0.140 us | _raw_spin_unlock_irqrestore();
0) 3.407 us | }
0) 7.233 us | }
0) | dm_dispatch_request [dm_mod]() {
0) | blk_insert_cloned_request() {
0) 0.120 us | blk_rq_check_limits();
0) 0.103 us | _raw_spin_lock_irqsave();
0) | blk_account_io_start() {
0) 0.197 us | disk_map_sector_rcu();
0) | part_round_stats() {
0) 0.230 us | part_round_stats_single();
0) 1.072 us | }
0) 2.854 us | }
0) | __elv_add_request() {
0) | elv_drain_elevator() {
0) 0.183 us | deadline_dispatch_requests();
0) 1.003 us | }
0) | __blk_run_queue() {
0) | scsi_request_fn() {
0) 0.277 us | get_device();
0) | blk_peek_request() {
0) | scsi_prep_fn() {
0) | scsi_get_command() {
0) 0.100 us | get_device();
0) | __scsi_get_command() {
0) | scsi_host_alloc_command() {
0) | scsi_pool_alloc_command() {
0) 0.399 us | kmem_cache_alloc();
0) | kmem_cache_alloc() {
0) 0.922 us | __slab_alloc();
0) 1.871 us | }
0) 3.830 us | }
0) 4.778 us | }
0) 5.584 us | }
0) 0.163 us | init_timer_key();
0) 0.107 us | _raw_spin_lock_irqsave();
0) 0.110 us | _raw_spin_unlock_irqrestore();
0) 9.678 us | }
0) | sd_init_command [sd_mod]() {
0) | scsi_setup_fs_cmnd() {
0) 0.124 us | alua_prep_fn();
0) | scsi_init_io() {
0) | scsi_init_sgtable() {
0) | scsi_alloc_sgtable() {
0) | scsi_sg_alloc() {
0) | mempool_alloc() {
0) | mempool_alloc_slab() {
0) 0.300 us | kmem_cache_alloc();
0) 1.112 us | }
0) 2.161 us | }
0) 2.964 us | }
0) 4.269 us | }
0) | blk_start_request() {
0) 0.210 us | blk_dequeue_request();
0) | blk_add_timer() {
0) | __blk_add_timer() {
0) 0.113 us | round_jiffies_up();
0) 1.482 us | }
0) 2.410 us | }
0) 4.226 us | }
0) 0.093 us | _raw_spin_unlock();
0) 0.087 us | _raw_spin_lock();
0) 0.403 us | scsi_init_cmd_errh();
0) | scsi_dispatch_cmd() {
0) 0.230 us | scsi_log_send();
0) | iscsi_queuecommand [libiscsi]() {
0) | _raw_spin_lock_bh() {
0) 0.074 us | local_bh_disable();
0) 0.702 us | }
0) | iscsi_session_chkready [scsi_transport_iscsi]() {
0) 0.086 us | _raw_spin_lock_irqsave();
0) 0.187 us | _raw_spin_unlock_irqrestore();
0) 1.562 us | }
0) 0.123 us | queue_work_on();
0) | _raw_spin_unlock_bh() {
0) 0.246 us | local_bh_enable_ip();
0) 0.839 us | }
0) + 17.190 us | }
0) + 19.251 us | }
0) 0.077 us | _raw_spin_lock_irq();
0) | blk_peek_request() {
0) 0.179 us | deadline_dispatch_requests();
0) 0.873 us | }
0) 0.130 us | put_device();
0) 0.080 us | _raw_spin_lock_irq();
0) + 87.287 us | }
0) + 87.941 us | }
0) + 90.672 us | }
0) 0.096 us | _raw_spin_unlock_irqrestore();
0) + 97.425 us | }
0) + 98.507 us | }
0) 0.077 us | _raw_spin_lock();
0) | blk_peek_request() {
0) 0.259 us | noop_dispatch();
0) 1.063 us | }
0) | blk_delay_queue() {
0) 0.080 us | msecs_to_jiffies();
0) 0.163 us | queue_delayed_work_on();
0) 1.305 us | }
0) 0.169 us | __srcu_read_unlock();
0) ! 290.409 us | }
No comments:
Post a Comment