Articles in this blog

Thursday, 17 December 2015

Linux Device Model Internals


In this post I will show how the bus_type, device_driver and device structures are linked, and how the match and probe functions of a device driver get called for a device. I used Linux kernel 3.19 for this simple program.

This program tries to:
1. Register a bus.
2. Add a device to that bus.
3. Register a device_driver on the same bus.

At the last step the match and probe functions of the device driver are called.
In the probe function I am creating a simple character device.


#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/major.h>
#include <linux/kdev_t.h>
#include <linux/fs.h>

#include <linux/device.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/semaphore.h>

#include <asm/uaccess.h>


dev_t my_devt;

char drv_name[] = "my_chr_drv";
struct class *my_class;
struct device *my_device;
struct device *my_chr_device;

char global_buf[20];

struct my_super_device
{
    struct cdev my_cdev;
    int count;
    struct semaphore my_sem;
};

struct my_super_device *my_super;

ssize_t my_read (struct file *myfile, char __user *my_buf, size_t len, loff_t *off)
{
    int ret;

    /* signal EOF once the whole buffer has been read */
    if(*off >= sizeof(global_buf))
        return 0;
    /* never copy more than what is left in the backing buffer */
    if(len > sizeof(global_buf) - *off)
        len = sizeof(global_buf) - *off;

    ret = copy_to_user(my_buf, global_buf + *off, len);
    if(ret != 0)
        printk("not able to copy\n");
    printk("copied the data to user buffer\n");
    *off += len;
    return len;
}

ssize_t my_write (struct file *my_file, const char __user *buf, size_t len, loff_t *off)
{
    int ret;

    /* never copy more than the size of the backing buffer */
    if(len > sizeof(global_buf))
        len = sizeof(global_buf);

    ret = copy_from_user(global_buf, buf, len);
    if(ret != 0)
        printk("not able to copy\n");
    printk("copied the data from user buffer\n");
    return len;
}

int my_mmap (struct file *my_file, struct vm_area_struct *my_vm)
{
    return 0;
}

int my_open (struct inode *my_inode, struct file *my_file)
{
    struct my_super_device *super_ptr;
    printk("open called\n");
    printk("cdev address = %p\n", my_inode->i_cdev);

    super_ptr = container_of(my_inode->i_cdev, struct my_super_device, my_cdev);
    /* take the semaphore while the device is held open */
    down(&(super_ptr->my_sem));
    printk("super address = %p\n", super_ptr);
    return 0;
}

int my_release (struct inode *my_inode, struct file *my_file)
{
    struct my_super_device *super_ptr;
    printk("release called\n");

    super_ptr = container_of(my_inode->i_cdev, struct my_super_device, my_cdev);
    /* give the semaphore back on close */
    up(&(super_ptr->my_sem));
    return 0;
}

unsigned int my_poll (struct file *my_file, struct poll_table_struct *poll_table)
{
    return 0;
}

struct file_operations myfops = {
    .owner = THIS_MODULE,
    .read = my_read,
    .write = my_write,
    .poll = my_poll,
    .open = my_open,
    .release = my_release,
};

int my_bus_match(struct device *dev, struct device_driver *drv)
{
    printk("Entered %s\n",__func__);
    /* match every device on this bus; a real bus would compare IDs here */
    return 1;
}
int my_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
{
    printk("Entered %s\n",__func__);
    return 0;
}

void my_bus_release(struct device *dev)
{
    printk("Entered %s\n",__func__);
    return ;
}

struct device my_bus = {
        .init_name = "my_bus",
    .release = my_bus_release,
};

struct bus_type my_bus_type = {
    .name = "my_bus",
    .match = my_bus_match,
    .uevent = my_bus_uevent,
};

int my_driver_probe(struct device *dev)
{
    int ret;

    printk("Entered %s\n",__func__);

    ret = alloc_chrdev_region(&my_devt, 0, 1, drv_name);
    if(ret == 0)
        printk("dev_t major is %d minor is %d\n", MAJOR(my_devt), MINOR(my_devt));

    /* class_create()/device_create() return ERR_PTR() on failure, not NULL */
    my_class = class_create(THIS_MODULE, "my_class");
    if(!IS_ERR(my_class))
        printk("class created as %s\n", my_class->name);

    my_chr_device = device_create(my_class, NULL, my_devt, NULL, "my_chr_drv");
    if(!IS_ERR(my_chr_device))
        printk("device created dev_t as %d\n", my_chr_device->devt);

    my_super = kzalloc(sizeof(struct my_super_device), GFP_KERNEL);
    if(my_super == NULL)
        return -ENOMEM;

    sema_init(&(my_super->my_sem), 1);

    cdev_init(&(my_super->my_cdev), &myfops);

    cdev_add(&(my_super->my_cdev), my_devt, 1);
    return 0;
}

int my_driver_remove(struct device *dev)
{
    printk("Entered %s\n",__func__);

    if(my_super != NULL) {
        cdev_del(&(my_super->my_cdev));
        kfree(my_super);
    }
    if(my_chr_device != NULL)
        device_destroy(my_class, my_devt);
    if(my_class != NULL)
        class_destroy(my_class);
    if(my_devt != 0)
        unregister_chrdev_region(my_devt,1);

    return 0;
}

struct device_driver my_dev_driver = {
    .owner = THIS_MODULE,
    .name = "my_chr_drv",
    .bus = &my_bus_type,
    .probe = my_driver_probe,
    .remove = my_driver_remove,
};

static int __init my_init(void)
{
     int ret = 0;

    printk("in init\n");

    /*Register the bus*/
    ret = device_register(&my_bus);
    if(ret == 0)
        printk("device registerd correctly\n");
    else
        printk("ret = %d\n",ret);

    ret = bus_register(&my_bus_type);
    if(ret == 0)
        printk("bus registered correctly\n");
    else
        printk("ret = %d\n",ret);

    /*add device for the device driver*/
    my_device = kzalloc(sizeof(struct device), GFP_KERNEL);
    if(my_device == NULL)
        return -ENOMEM;
    device_initialize(my_device);

    my_device->parent = &my_bus;
    my_device->bus = &my_bus_type;
    my_device->init_name = "my_chr_dev";

    ret = device_add(my_device);
    if(ret == 0)
        printk("device added correctly\n");
    else
        printk("ret = %d\n",ret);

    /*Now register the driver*/
    ret = driver_register(&my_dev_driver);
    if(ret == 0)
        printk("driver registered correctly\n");
    else
        printk("ret = %d\n",ret);

    return 0;
}

static void __exit my_exit(void)
{
    printk("in exit\n");
    driver_unregister(&my_dev_driver);
    device_unregister(my_device);    /* remove the device added in my_init */
    bus_unregister(&my_bus_type);
    device_unregister(&my_bus);
}

module_init(my_init);
module_exit(my_exit);
MODULE_AUTHOR("Kundan");
MODULE_LICENSE("GPL");



Here is a snipped function_graph ftrace of the device_add() call from this program, showing the driver's match and probe functions being invoked.




# tracer: function_graph
#
# CPU  DURATION                  FUNCTION CALLS
# |     |   |                     |   |   |   |
 1)               |  device_add() {
 1)               |    device_private_init() {
 1)               |      kmem_cache_alloc_trace() {
 1)   0.291 us    |        _cond_resched();
 1)   2.375 us    |      }
 1)   4.088 us    |    }
 1)               |    dev_set_name() {
 1)               |      __kmalloc_track_caller() {
 1)   0.240 us    |        kmalloc_slab();
 1)   0.220 us    |        _cond_resched();
 1)   3.277 us    |      }
 1)   0.231 us    |      kfree();
 1)   7.374 us    |    }
 1)   0.601 us    |    get_device_parent();
 1)   0.280 us    |    _raw_spin_lock();
 1)   0.210 us    |    _raw_spin_unlock();
 1)               |    sysfs_create_dir_ns() {
 1)               |        kernfs_add_one() {
 1)               |          mutex_lock() {
 1)   0.210 us    |            _cond_resched();
 1)   1.704 us    |          }
 1)   0.401 us    |          kernfs_name_hash();
 1)   0.511 us    |          kernfs_link_sibling();
 1)   0.291 us    |          mutex_unlock();
 1)               |          kernfs_activate() {
 1)               |            mutex_lock() {
 1)   0.210 us    |              _cond_resched();
 1)   1.733 us    |            }
 1)               |            kernfs_next_descendant_post() {
 1)   0.271 us    |              kernfs_leftmost_descendant();
 1)   1.903 us    |            }
 1)   0.220 us    |            kernfs_next_descendant_post();
 1)   0.261 us    |            mutex_unlock();
 1)   9.459 us    |          }
 1) + 19.107 us   |        }
 1) + 49.938 us   |      }
 1) + 51.360 us   |    }
 1)   0.251 us    |    kernfs_get();
 1)               |    acpi_platform_notify() {
 1)               |      acpi_get_bus_type() {
 1)               |        down_read() {
 1)   0.200 us    |          _cond_resched();
 1)   1.714 us    |        }
 1)   0.250 us    |        pci_acpi_bus_match();
 1)   0.221 us    |        usb_acpi_bus_match();
 1)   0.261 us    |        up_read();
 1) + 10.039 us   |      }
 1)   0.281 us    |      acpi_bind_one();
 1) + 13.626 us   |    }
 1)               |    device_create_file() {
 1) + 46.381 us   |    }
 1)   0.250 us    |    sysfs_create_groups();
 1)               |    bus_add_device() {
 1)               |      device_add_groups() {
 1)   0.220 us    |        sysfs_create_groups();
 1)   1.933 us    |      }
 1)               |      sysfs_create_link() {
 1)               |        sysfs_do_create_link_sd.isra.2() {
 1)   0.501 us    |    kfree();
 1)   0.581 us    |    kfree();
 1)               |    bus_probe_device() {
 1)               |      device_attach() {
 1)               |        mutex_lock() {
 1)   0.251 us    |          _cond_resched();
 1)   1.803 us    |        }
 1)               |        bus_for_each_drv() {
 1)   0.271 us    |          _raw_spin_lock();
 1)   0.221 us    |          _raw_spin_unlock();
 1)               |          __device_attach() {
 1)               |            my_bus_match [bus_final_1]() {
 1)               |              printk() {
 1)               |                vprintk_default() {
 1)               |                  vprintk_emit() {
 1)   0.291 us    |                    _raw_spin_lock();
 1)   0.496 us    |                    log_store();
 1)   0.085 us    |                    _raw_spin_unlock();
 1)   0.265 us    |                    console_trylock();
 1)   2.800 us    |                    console_unlock();
 1) + 15.946 us   |                  }
 1) + 17.038 us   |                }
 1) + 18.301 us   |              }
 1) + 19.412 us   |            }
 1)               |            driver_probe_device() {
 1)               |              pm_runtime_barrier() {
 1)   0.115 us    |                _raw_spin_lock_irq();
 1)   0.135 us    |                __pm_runtime_barrier();
 1)   1.393 us    |              }
 1)               |              pinctrl_bind_pins() {
 1)               |              my_driver_probe [bus_final_1]() {
 1)               |                printk() {
 1)               |                  vprintk_default() {
 1)   4.534 us    |                    vprintk_emit();
 1)   5.130 us    |                  }
 1)   5.711 us    |                }
 1)               |                alloc_chrdev_region() {
 1)               |                  __register_chrdev_region() {
 1)   0.155 us    |                    kmem_cache_alloc_trace();
 1)   0.516 us    |                    mutex_lock();
 1)   0.106 us    |                    mutex_unlock();
 1)   2.660 us    |                  }
 1)   3.327 us    |                }
 1)               |                printk() {
 1)               |                  vprintk_default() {
 1)   2.841 us    |                    vprintk_emit();
 1)   3.427 us    |                  }
 1)   4.053 us    |                }
 1)               |                __class_create() {
 1)               |                  kmem_cache_alloc_trace() {
 1)   0.090 us    |                    _cond_resched();
 1)   0.902 us    |                  }
 1)               |                  __class_register() {
 1)   1.348 us    |                    kmem_cache_alloc_trace();
 1)   0.091 us    |                    __mutex_init();
 1)   0.216 us    |                    __kmalloc_track_caller();
 1)   0.101 us    |                    kfree();
 1)   0.110 us    |                    _raw_spin_lock();
 1)   0.080 us    |                    _raw_spin_unlock();
 1)   3.927 us    |                    sysfs_create_dir_ns();
 1)   0.110 us    |                    kernfs_get();
 1)   0.090 us    |                    class_child_ns_type();
 1)   0.586 us    |                    kmem_cache_alloc_trace();
 1) + 30.034 us   |                  }
 1) + 32.197 us   |                }
 1)               |                printk() {
 1)               |                  vprintk_default() {
 1)   2.680 us    |                    vprintk_emit();
 1)   3.266 us    |                  }
 1)   3.858 us    |                }
 1)               |                device_create() {
 1)               |                  device_create_groups_vargs() {
 1)   0.542 us    |                    kmem_cache_alloc_trace();
 1)   0.861 us    |                    device_initialize();
 1)   0.206 us    |                    __kmalloc_track_caller();
 1)   0.100 us    |                    kfree();
 1) # 2434.891 us |                    device_add();
 1) # 2443.262 us |                  }
 1) # 2443.979 us |                }
 1)               |                printk() {
 1)               |                  vprintk_default() {
 1)   5.891 us    |                    vprintk_emit();
 1)   6.703 us    |                  }
 1)   7.410 us    |                }
 1)               |                kmem_cache_alloc_trace() {
 1)   0.090 us    |                  _cond_resched();
 1)   0.896 us    |                }
 1)   0.255 us    |                cdev_init();
 1)               |                cdev_add() {
 1)               |                  kobj_map() {
 1)   0.240 us    |                    __kmalloc();
 1)   0.400 us    |                    mutex_lock();
 1)   0.100 us    |                    mutex_unlock();
 1)   2.685 us    |                  }
 1)   3.402 us    |                }
 1) # 2511.166 us |              }
 1) # 3992.051 us |  }

Wednesday, 9 December 2015

Linux Interrupt handling for x86 systems


Let's see how interrupts are handled on an x86 system.
For this write-up I have used the Linux 3.15 kernel source.

The do_IRQ function is called from arch/x86/kernel/entry_64.S:
...
common_interrupt:
XCPT_FRAME
ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
...

do_IRQ implementation
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
        struct pt_regs *old_regs = set_irq_regs(regs);

        /* high bit used in ret_from_ code  */
        unsigned vector = ~regs->orig_ax;
        unsigned irq;

        irq_enter();
        exit_idle();

        irq = __this_cpu_read(vector_irq[vector]);

        if (!handle_irq(irq, regs)) {
                ack_APIC_irq();         /* End of Interrupt */

                if (irq != VECTOR_RETRIGGERED) {
                        pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
                                             __func__, smp_processor_id(),
                                             vector, irq);
                } else {
                        __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
                }
        }

        irq_exit();

        set_irq_regs(old_regs);
        return 1;
}

do_IRQ (arch/x86/kernel/irq.c) first enters interrupt context using the function irq_enter().
irq_enter() does a fair amount of work with the timers and updates jiffies.
It also increments the preempt count, via preempt_count_add(HARDIRQ_OFFSET) in __irq_enter().

Then it calls exit_idle to end the idle loop.
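
As a small aside (a sketch of my own, not code from the kernel tree), the hardirq accounting added by irq_enter() is what makes in_irq() return true inside any hard interrupt handler:

#include <linux/interrupt.h>
#include <linux/hardirq.h>

static irqreturn_t my_isr(int irq, void *dev_id)
{
        /* true here because __irq_enter() did preempt_count_add(HARDIRQ_OFFSET) */
        WARN_ON(!in_irq());
        return IRQ_HANDLED;
}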

Taking the IRQ number
do_IRQ then reads the IRQ number for this vector from the CPU's vector_irq array:

unsigned vector = ~regs->orig_ax;

irq = __this_cpu_read(vector_irq[vector]);

Now everything is in place to execute the interrupt handler. That is done in the handle_irq function:
bool handle_irq(unsigned irq, struct pt_regs *regs)
{
        struct irq_desc *desc;

        stack_overflow_check(regs);

        desc = irq_to_desc(irq);
        if (unlikely(!desc))
                return false;

        generic_handle_irq_desc(irq, desc);
        return true;
}

This function just fetches the interrupt descriptor and calls generic_handle_irq_desc.

static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
{
        desc->handle_irq(irq, desc);
}

It simply invokes the flow handler installed in the descriptor:
desc->handle_irq(irq, desc);

The handle_irq callback is set by irq_set_chip_and_handler_name() at initialization time.
Generally it points to handle_edge_irq or handle_level_irq, depending on whether the interrupt being handled is edge or level triggered.
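
For illustration, an interrupt controller driver typically installs the flow handler for each IRQ roughly like this (a generic sketch; my_irq_chip and level_triggered are placeholders, not names from the kernel source):

/* choose the flow handler according to the trigger type of the line */
if (level_triggered)
        irq_set_chip_and_handler_name(irq, &my_irq_chip,
                                      handle_level_irq, "level");
else
        irq_set_chip_and_handler_name(irq, &my_irq_chip,
                                      handle_edge_irq, "edge");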

Difference between Edge triggered and Level Triggered interrupt handling 

Edge triggered interrupts need not be masked while they are being handled, whereas level triggered interrupts must be masked.

handle_edge_irq
void
handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
        raw_spin_lock(&desc->lock);

        desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
        /*
         * If we're currently running this IRQ, or its disabled,
         * we shouldn't process the IRQ. Mark it pending, handle
         * the necessary masking and go out
         */
        if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
                     irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
                if (!irq_check_poll(desc)) {
                        desc->istate |= IRQS_PENDING;
                        mask_ack_irq(desc);
                        goto out_unlock;
                }
        }
        kstat_incr_irqs_this_cpu(irq, desc);

        /* Start handling the irq */
        desc->irq_data.chip->irq_ack(&desc->irq_data);

        do {
                if (unlikely(!desc->action)) {
                        mask_irq(desc);
                        goto out_unlock;
                }

                /*
                 * When another irq arrived while we were handling
                 * one, we could have masked the irq.
                 * Renable it, if it was not disabled in meantime.
                 */
                if (unlikely(desc->istate & IRQS_PENDING)) {
                        if (!irqd_irq_disabled(&desc->irq_data) &&
                            irqd_irq_masked(&desc->irq_data))
                                unmask_irq(desc);
                }

                handle_irq_event(desc);

        } while ((desc->istate & IRQS_PENDING) &&
                 !irqd_irq_disabled(&desc->irq_data));

out_unlock:
        raw_spin_unlock(&desc->lock);
}

EXPORT_SYMBOL(handle_edge_irq);

[BOOKMARK] As edge triggered interrupts are not masked, the same interrupt can arrive on another CPU even while it is being handled on one CPU.
The first part of handle_edge_irq checks whether the interrupt is already in progress (for example on another CPU) or disabled; if so, it marks it pending, masks and acknowledges it, and bails out.

if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
             irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
        if (!irq_check_poll(desc)) {
                desc->istate |= IRQS_PENDING;
                mask_ack_irq(desc);
                goto out_unlock;
        }
}

Next, handle_edge_irq acknowledges the interrupt at the interrupt controller:
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);


Next there is a loop that handles all the interrupt requests that arrive for this IRQ while it is being serviced:

do {
        if (unlikely(!desc->action)) {
                mask_irq(desc);
                goto out_unlock;
        }

        /*
         * When another irq arrived while we were handling
         * one, we could have masked the irq.
         * Renable it, if it was not disabled in meantime.
         */
        if (unlikely(desc->istate & IRQS_PENDING)) {
                if (!irqd_irq_disabled(&desc->irq_data) &&
                    irqd_irq_masked(&desc->irq_data))
                        unmask_irq(desc);
        }

        handle_irq_event(desc);

} while ((desc->istate & IRQS_PENDING) &&
         !irqd_irq_disabled(&desc->irq_data));


Here it checks for another pending occurrence of the interrupt (which might have arrived on another CPU in the meantime) and handles it as well. See [BOOKMARK].


handle_level_irq :
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
        raw_spin_lock(&desc->lock);
        mask_ack_irq(desc);

        if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
                if (!irq_check_poll(desc))
                        goto out_unlock;

        desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
        kstat_incr_irqs_this_cpu(irq, desc);

        /*
         * If its disabled or no action available
         * keep it masked and get out of here
         */
        if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
                desc->istate |= IRQS_PENDING;
                goto out_unlock;
        }

        handle_irq_event(desc);

        cond_unmask_irq(desc);

out_unlock:
        raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);

As the level triggered interrupt is kept masked while it is being handled, there are fewer race conditions here than in handle_edge_irq.
The function simply calls mask_ack_irq(desc) at the top and unmasks again (cond_unmask_irq) once the event has been handled.

handle_irq_event: this function runs all the handler actions registered for this interrupt:

do {
        irqreturn_t res;

        trace_irq_handler_entry(irq, action);
        res = action->handler(irq, action->dev_id);
        trace_irq_handler_exit(irq, action, res);

        if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
                      irq, action->handler))
                local_irq_disable();

        switch (res) {
        case IRQ_WAKE_THREAD:
                /*
                 * Catch drivers which return WAKE_THREAD but
                 * did not set up a thread function
                 */
                if (unlikely(!action->thread_fn)) {
                        warn_no_thread(irq, action);
                        break;
                }

                __irq_wake_thread(desc, action);

                /* Fall through to add to randomness */
        case IRQ_HANDLED:
                flags |= action->flags;
                break;

        default:
                break;
        }

        retval |= res;
        action = action->next;
} while (action);


End of Interrupt : On x86 systems the local APIC has to be told that interrupt processing is finished by writing to its EOI register (ack_APIC_irq()). In do_IRQ this is done directly when no handler is found for the vector; otherwise the EOI is sent from the flow handler through the irq_chip (apic_ack_edge in the trace below).

do_IRQ()
...
if (!handle_irq(irq, regs)) {
ack_APIC_irq();
...



The following ftrace function_graph of do_IRQ shows the sequence of events:
1)   ==========> |
 1)               |  do_IRQ() {
 1)               |    irq_enter() {
 1)               |      rcu_irq_enter() {
 1)   0.201 us    |        rcu_eqs_exit_common.isra.45();
 1)   1.223 us    |      }
 1)               |      tick_irq_enter() {
 1)   0.190 us    |        tick_check_oneshot_broadcast_this_cpu();
 1)               |        ktime_get() {
 1)   1.192 us    |          read_hpet();
 1)   2.174 us    |        }
 1)               |        update_ts_time_stats() {
 1)   0.175 us    |          nr_iowait_cpu();
 1)   1.217 us    |        }
 1)               |        ktime_get() {
 1)   1.057 us    |          read_hpet();
 1)   2.019 us    |        }
 1)   0.135 us    |        touch_softlockup_watchdog();
 1)               |        tick_do_update_jiffies64() {
 1)   0.185 us    |          _raw_spin_lock();
 1)               |          do_timer() {
 1)   0.140 us    |            calc_global_load();
 1)   1.162 us    |          }
 1)   0.135 us    |          _raw_spin_unlock();
 1)               |          update_wall_time() {
 1)   0.200 us    |            _raw_spin_lock_irqsave();
 1)   0.967 us    |            read_hpet();
 1)   0.140 us    |            ntp_tick_length();
 1)   0.136 us    |            ntp_tick_length();
 1)   0.136 us    |            ntp_tick_length();
 1)               |            timekeeping_update.constprop.8() {
 1)   0.290 us    |              update_vsyscall();
 1)               |              raw_notifier_call_chain() {
 1)   0.246 us    |                notifier_call_chain();
 1)   1.193 us    |              }
 1)   3.526 us    |            }
 1)   0.191 us    |            _raw_spin_unlock_irqrestore();
 1) + 11.778 us   |          }
 1) + 16.802 us   |        }
 1)   0.135 us    |        touch_softlockup_watchdog();
 1) + 29.011 us   |      }
 1)               |      _local_bh_enable() {
 1)   0.145 us    |        __local_bh_enable();
 1)   1.107 us    |      }
 1) + 34.723 us   |    }
 1)               |    exit_idle() {
 1)               |      atomic_notifier_call_chain() {
 1)   0.145 us    |        notifier_call_chain();
 1)   1.107 us    |      }
 1)   2.069 us    |    }
 1)               |    handle_irq() {
 1)   0.225 us    |      irq_to_desc();
 1)               |      handle_edge_irq() {
 1)   0.175 us    |        _raw_spin_lock();
 1)   0.180 us    |        irq_may_run();
 1)               |        apic_ack_edge() {
 1)   0.140 us    |          irq_complete_move();
 1)   0.140 us    |          irq_move_irq();
 1)   2.069 us    |        }
 1)               |        handle_irq_event() {
 1)   0.140 us    |          _raw_spin_unlock();
 1)               |          handle_irq_event_percpu() {
 1)               |            i8042_interrupt() {
 1)   0.231 us    |              _raw_spin_lock_irqsave();
 1)   0.155 us    |              _raw_spin_unlock_irqrestore();
 1)               |              serio_interrupt() {
 1)   0.205 us    |                _raw_spin_lock_irqsave();
 1)               |                psmouse_interrupt [psmouse]() {
 1)               |                  psmouse_handle_byte [psmouse]() {
 1)   0.315 us    |                    synaptics_process_byte [psmouse]();
 1)   1.303 us    |                  }
 1)   2.390 us    |                }
 1)   0.210 us    |                _raw_spin_unlock_irqrestore();
 1)   5.816 us    |              }
 1) + 14.528 us   |            }
 1)   0.340 us    |            add_interrupt_randomness();
 1)   0.196 us    |            note_interrupt();
 1) + 18.517 us   |          }
 1)   0.196 us    |          _raw_spin_lock();
 1) + 21.462 us   |        }
 1)   0.150 us    |        _raw_spin_unlock();
 1) + 28.525 us   |      }
 1) + 30.554 us   |    }
 1)               |    irq_exit() {
 1)   0.165 us    |      idle_cpu();
 1)               |      tick_nohz_irq_exit() {
 1)               |        __tick_nohz_idle_enter() {
 1)               |          ktime_get() {
 1)   1.298 us    |            read_hpet();
 1)   2.340 us    |          }
 1)   0.151 us    |          timekeeping_max_deferment();
 1)               |          get_next_timer_interrupt() {
 1)   0.206 us    |            _raw_spin_lock();
 1)   0.151 us    |            _raw_spin_unlock();
 1)               |            hrtimer_get_next_event() {
 1)   0.191 us    |              _raw_spin_lock_irqsave();
 1)   0.151 us    |              _raw_spin_unlock_irqrestore();
 1)   2.109 us    |            }
 1)   5.100 us    |          }
 1)               |          hrtimer_start() {
 1)               |            __hrtimer_start_range_ns() {
 1)               |              lock_hrtimer_base.isra.22() {
 1)   0.180 us    |                _raw_spin_lock_irqsave();
 1)   1.147 us    |              }
 1)   0.246 us    |              __remove_hrtimer();
 1)   0.135 us    |              get_nohz_timer_target();
 1)   0.321 us    |              enqueue_hrtimer();
 1)   0.156 us    |              _raw_spin_unlock_irqrestore();
 1)   6.373 us    |            }
 1)   7.329 us    |          }
 1) + 18.691 us   |        }
 1) + 19.658 us   |      }
 1)               |      rcu_irq_exit() {
 1)   0.225 us    |        rcu_eqs_enter_common.isra.44();
 1)   1.277 us    |      }
 1) + 23.781 us   |    }
 1) + 94.920 us   |  }
 1)   <========== |

Wednesday, 6 May 2015

Linux Signals - Internals

Signal User Space C Program 

Let's start by writing a simple user space C program that handles a signal:

#include<signal.h>
#include<stdio.h>

/* Handler function */
void handler(int sig) {
        printf("Receive signal: %u\n", sig);
};

int main(void) {
        struct sigaction sig_a;

        /* Initialize the signal handler structure */
        sig_a.sa_handler = handler;
        sigemptyset(&sig_a.sa_mask);
        sig_a.sa_flags = 0;

        /* Assign a new handler function to the SIGINT signal */
        sigaction(SIGINT, &sig_a, NULL);

        /* Block and wait until a signal arrives */
        while (1) {
                sigsuspend(&sig_a.sa_mask);
                printf("loop\n");
        }
        return 0;
};

This code installs a new handler for the SIGINT signal. SIGINT can be sent to the running process with the Ctrl+C key combination; when Ctrl+C is pressed, the asynchronous signal SIGINT is delivered to the task. It is equivalent to running kill -2 <pid> in another terminal.
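
The same signal can also be sent programmatically with the kill() system call. A minimal sketch (reading the target PID from the command line is just an assumption of this example):

#include <signal.h>
#include <stdlib.h>
#include <sys/types.h>

int main(int argc, char *argv[])
{
        pid_t pid;

        if (argc < 2)
                return 1;
        pid = (pid_t)atoi(argv[1]);     /* PID of the target process */
        return kill(pid, SIGINT);       /* same signal that Ctrl+C delivers */
}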

Running kill -l lists the various signals that can be sent to a running process.

[root@linux ~]# kill -l
 1) SIGHUP 2) SIGINT 3) SIGQUIT 4) SIGILL 5) SIGTRAP
 6) SIGABRT 7) SIGBUS 8) SIGFPE 9) SIGKILL 10) SIGUSR1
11) SIGSEGV 12) SIGUSR2 13) SIGPIPE 14) SIGALRM 15) SIGTERM
16) SIGSTKFLT 17) SIGCHLD 18) SIGCONT 19) SIGSTOP 20) SIGTSTP
21) SIGTTIN 22) SIGTTOU 23) SIGURG 24) SIGXCPU 25) SIGXFSZ
26) SIGVTALRM 27) SIGPROF 28) SIGWINCH 29) SIGIO 30) SIGPWR
31) SIGSYS 34) SIGRTMIN 35) SIGRTMIN+1 36) SIGRTMIN+2 37) SIGRTMIN+3
38) SIGRTMIN+4 39) SIGRTMIN+5 40) SIGRTMIN+6 41) SIGRTMIN+7 42) SIGRTMIN+8
43) SIGRTMIN+9 44) SIGRTMIN+10 45) SIGRTMIN+11 46) SIGRTMIN+12 47) SIGRTMIN+13
48) SIGRTMIN+14 49) SIGRTMIN+15 50) SIGRTMAX-14 51) SIGRTMAX-13 52) SIGRTMAX-12
53) SIGRTMAX-11 54) SIGRTMAX-10 55) SIGRTMAX-9 56) SIGRTMAX-8 57) SIGRTMAX-7
58) SIGRTMAX-6 59) SIGRTMAX-5 60) SIGRTMAX-4 61) SIGRTMAX-3 62) SIGRTMAX-2
63) SIGRTMAX-1 64) SIGRTMAX

The following key combinations can also be used to send particular signals:
CTRL-C - sends SIGINT, whose default action is to terminate the application.
CTRL-\ - sends SIGQUIT, whose default action is to terminate the application and dump core.
CTRL-Z - sends SIGTSTP, which suspends the program.

If you compile and run the above C program you will get the following output:
[root@linux signal]# ./a.out 
Receive signal: 2
loop
Receive signal: 2
loop
^CReceive signal: 2
loop


Even with Ctrl+C or kill -2 <pid> the process does not terminate. Instead it executes the signal handler and returns to the loop.



How the signal is sent to process


To see the internals of sending a signal to a process, we can place a jprobe that calls dump_stack() at the __send_signal function; it produces the following call trace:

May  5 16:18:37 linux kernel: [<ffffffff815e19ba>] dump_stack+0x19/0x1b
May  5 16:18:37 linux kernel: [<ffffffffa08e3029>] my_handler+0x29/0x30 [probe]
May  5 16:18:37 linux kernel: [<ffffffff81071a75>] complete_signal+0x205/0x250
May  5 16:18:37 linux kernel: [<ffffffff81071f54>] __send_signal+0x194/0x4b0
May  5 16:18:37 linux kernel: [<ffffffff810722ae>] send_signal+0x3e/0x80
May  5 16:18:37 linux kernel: [<ffffffff81072db2>] do_send_sig_info+0x52/0xa0
May  5 16:18:37 linux kernel: [<ffffffff81073326>] group_send_sig_info+0x46/0x50
May  5 16:18:37 linux kernel: [<ffffffff8107337d>] __kill_pgrp_info+0x4d/0x80
May  5 16:18:37 linux kernel: [<ffffffff810733e5>] kill_pgrp+0x35/0x50
May  5 16:18:37 linux kernel: [<ffffffff81374e8b>] n_tty_receive_char+0x42b/0xe30
May  5 16:18:37 linux kernel: [<ffffffff81106d46>] ? ftrace_ops_list_func+0x106/0x120
May  5 16:18:37 linux kernel: [<ffffffff81375a3c>] n_tty_receive_buf+0x1ac/0x470
May  5 16:18:37 linux kernel: [<ffffffff81378dc9>] flush_to_ldisc+0x109/0x160
May  5 16:18:37 linux kernel: [<ffffffff8107e02b>] process_one_work+0x17b/0x460
May  5 16:18:37 linux kernel: [<ffffffff8107edfb>] worker_thread+0x11b/0x400
May  5 16:18:37 linux kernel: [<ffffffff8107ece0>] ? rescuer_thread+0x400/0x400
May  5 16:18:37 linux kernel: [<ffffffff81085aef>] kthread+0xcf/0xe0
May  5 16:18:37 linux kernel: [<ffffffff81085a20>] ? kthread_create_on_node+0x140/0x140
May  5 16:18:37 linux kernel: [<ffffffff815f206c>] ret_from_fork+0x7c/0xb0
May  5 16:18:37 linux kernel: [<ffffffff81085a20>] ? kthread_create_on_node+0x140/0x140
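
For reference, a jprobe module that produces such a trace can be as small as the following sketch (it assumes a 3.x kernel where the jprobe API is still available and that __send_signal has the prototype shown below; it is an illustration, not the exact probe module used for the trace above):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/sched.h>
#include <linux/signal.h>

/* same prototype as __send_signal() in kernel/signal.c on these kernels */
static int jp_send_signal(int sig, struct siginfo *info,
                          struct task_struct *t, int group,
                          int from_ancestor_ns)
{
        dump_stack();           /* print the call chain that led here */
        jprobe_return();        /* mandatory for a jprobe handler */
        return 0;
}

static struct jprobe my_jprobe = {
        .entry = jp_send_signal,
        .kp = { .symbol_name = "__send_signal" },
};

static int __init jp_init(void)
{
        return register_jprobe(&my_jprobe);
}

static void __exit jp_exit(void)
{
        unregister_jprobe(&my_jprobe);
}

module_init(jp_init);
module_exit(jp_exit);
MODULE_LICENSE("GPL");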

So the major function call sequence for sending the signal goes like this:
The tty layer picks up the Ctrl+C character in n_tty_receive_char
n_tty_receive_char()
isig()
kill_pgrp()
__kill_pgrp_info()
group_send_sig_info() -- called for each PID in the process group
do_send_sig_info()
send_signal()
__send_signal() -- allocates a signal structure and adds it to the task's pending signals
complete_signal()
signal_wake_up()
signal_wake_up_state()  -- sets TIF_SIGPENDING in the task's thread info flags, then wakes up the thread to which the signal was delivered.


Now everything is set up and necessary changes are done to the task_struct of the process. 



Handling of signal

The signal is checked and handled when the process returns from a system call or from an interrupt. The return path from a system call is in entry_64.S.
The int_signal label in entry_64.S calls the function do_notify_resume().

Let's check the function do_notify_resume(). It checks whether the TIF_SIGPENDING flag is set for the task:
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
        do_signal(regs);

do_signal() calls handle_signal() to invoke the signal-specific handler.
The handler itself actually runs in user mode; this is set up in:
__setup_rt_frame -- this sets the user instruction pointer to the handler : regs->ip = (unsigned long) ksig->ka.sa.sa_handler;

SYSTEM calls and signals

“Slow” syscalls, e.g. a blocking read/write, put the process into a waiting state: TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE.
A task in the TASK_INTERRUPTIBLE state is changed to TASK_RUNNING by a signal (TASK_RUNNING means the process can be scheduled).
When the task runs again, its signal handler executes before the “slow” syscall completes; by default the interrupted syscall does not complete and returns -EINTR.
If the SA_RESTART flag is set, the syscall is restarted after the signal handler finishes.
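
As a small illustration (a sketch, not part of the program above), requesting the restart behaviour is just a matter of setting SA_RESTART when the handler is installed:

#include <signal.h>

static void handler(int sig) { /* ... */ }

int install_handler(void)
{
        struct sigaction sa;

        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = SA_RESTART;       /* restart "slow" syscalls after the handler runs */
        return sigaction(SIGINT, &sa, NULL);
}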

Tuesday, 21 April 2015

Linux spinlock implementation on x86_64 (ticketing and lock prefix)


Let's look at the spin_lock implementation in kernel 3.10.0 on an x86_64 machine.

In include/linux/spinlock.h we have the definition of spin_lock

static inline void spin_lock(spinlock_t *lock)
{
        raw_spin_lock(&lock->rlock);
}

#define raw_spin_lock(lock) _raw_spin_lock(lock)

kernel/spinlock.c : 136
void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
        __raw_spin_lock(lock);
}
EXPORT_SYMBOL(_raw_spin_lock);

Now the implementation goes to arch/x86/include/asm/spinlock.h: 87
static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
        register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };

        inc = xadd(&lock->tickets, inc);
        if (likely(inc.head == inc.tail))
                goto out;

        inc.tail &= ~TICKET_SLOWPATH_FLAG;
        for (;;) {
                unsigned count = SPIN_THRESHOLD;

                do {
                        if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
                                goto out;
                        cpu_relax();
                } while (--count);
                __ticket_lock_spinning(lock, inc.tail);
        }
out:    barrier(); /* make sure nothing creeps before the lock is taken */
}

From the implementation of arch_spin_lock we can see that ticket spinlocks are used to share the lock fairly among the CPUs.
More about ticket locks can be found here : http://en.wikipedia.org/wiki/Ticket_lock
The pseudo code for Lock and Unlock looks like this:
 record locktype {
    int ticketnumber
    int turn
 }
 procedure LockInit( locktype* lock ) {
    lock.ticketnumber := 0
    lock.turn := 0
 }
 procedure Lock( locktype* lock ) {
    int myturn := FetchAndIncrement( &lock.ticketnumber )
    while lock.turn ≠ myturn
        skip // spin until lock is acquired
 }
 procedure UnLock( locktype* lock ) {
    FetchAndIncrement( &lock.turn )
 }
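
The same idea can also be written as a small user space C sketch (my own illustration, using GCC's __atomic builtins in place of the kernel's xadd; the ticketlock_t name is made up for this example):

typedef struct {
        unsigned short head;    /* "turn": ticket currently being served */
        unsigned short tail;    /* "ticketnumber": next ticket to hand out */
} ticketlock_t;

static void ticket_lock(ticketlock_t *lock)
{
        /* atomically take a ticket (fetch-and-increment, like xadd) */
        unsigned short me = __atomic_fetch_add(&lock->tail, 1, __ATOMIC_ACQUIRE);

        /* spin until our ticket is the one being served */
        while (__atomic_load_n(&lock->head, __ATOMIC_ACQUIRE) != me)
                __builtin_ia32_pause();         /* like cpu_relax() on x86 */
}

static void ticket_unlock(ticketlock_t *lock)
{
        __atomic_fetch_add(&lock->head, 1, __ATOMIC_RELEASE);
}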

We also need to look at the xadd() macro used above:
xadd atomically adds "inc" to "*ptr" and returns the previous value of *ptr.
The "lock" prefix is used in xadd; this prefix provides the synchronization when multiple CPUs try to acquire the lock at the same time.

More details about the lock prefix :
http://www.codemaestro.com/reviews/8
All x86 CPUs are equipped with the ability to lock a specific memory address, preventing other processors from reading or modifying it while the locked instruction runs. The LOCK prefix on an assembly instruction causes the CPU to assert the LOCK# signal, and practically ensures exclusive use of the memory address in multiprocessor / multi-threaded environments.
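
Very roughly, for a 32-bit operand the kernel's xadd() boils down to something like the following (a simplified sketch, not the kernel's actual macro, which handles several operand sizes):

static inline unsigned int xadd_u32(unsigned int *ptr, unsigned int inc)
{
        /* lock xadd: *ptr += inc, and inc receives the old value of *ptr */
        asm volatile("lock; xaddl %0, %1"
                     : "+r" (inc), "+m" (*ptr)
                     : : "memory", "cc");
        return inc;     /* previous value of *ptr */
}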

If we disassemble the function _raw_spin_lock we see the lock prefix used before xadd :

kernel/spinlock.c: 136
0xffffffff815e90b0 <_raw_spin_lock>:    data32 data32 data32 xchg %ax,%ax
0xffffffff815e90b5 <_raw_spin_lock+5>:  push   %rbp
0xffffffff815e90b6 <_raw_spin_lock+6>:  mov    %rsp,%rbp
arch/x86/include/asm/spinlock.h: 87
0xffffffff815e90b9 <_raw_spin_lock+9>:  mov    $0x20000,%eax
0xffffffff815e90be <_raw_spin_lock+14>: lock xadd %eax,(%rdi)
0xffffffff815e90c2 <_raw_spin_lock+18>: mov    %eax,%edx
0xffffffff815e90c4 <_raw_spin_lock+20>: shr    $0x10,%edx
arch/x86/include/asm/spinlock.h: 88
0xffffffff815e90c7 <_raw_spin_lock+23>: cmp    %ax,%dx
0xffffffff815e90ca <_raw_spin_lock+26>: jne    0xffffffff815e90ce <_raw_spin_lock+30>
kernel/spinlock.c: 138
0xffffffff815e90cc <_raw_spin_lock+28>: pop    %rbp
0xffffffff815e90cd <_raw_spin_lock+29>: retq  
/usr/src/debug/kernel-3.10.0-123.el7/linux-3.10.0-123.el7.x86_64/arch/x86/include/asm/spinlock.h: 91
0xffffffff815e90ce <_raw_spin_lock+30>: and    $0xfffffffe,%edx
arch/x86/include/asm/paravirt.h: 718
0xffffffff815e90d1 <_raw_spin_lock+33>: movzwl %dx,%esi
kernel/spinlock.c: 136
0xffffffff815e90d4 <_raw_spin_lock+36>: mov    $0x8000,%eax
0xffffffff815e90d9 <_raw_spin_lock+41>: jmp    0xffffffff815e90e7 <_raw_spin_lock+55>
0xffffffff815e90db <_raw_spin_lock+43>: nopl   0x0(%rax,%rax,1)
arch/x86/include/asm/processor.h: 661
0xffffffff815e90e0 <_raw_spin_lock+48>: pause
arch/x86/include/asm/spinlock.h: 99
0xffffffff815e90e2 <_raw_spin_lock+50>: sub    $0x1,%eax
0xffffffff815e90e5 <_raw_spin_lock+53>: je     0xffffffff815e90f1 <_raw_spin_lock+65>
arch/x86/include/asm/spinlock.h: 96
0xffffffff815e90e7 <_raw_spin_lock+55>: movzwl (%rdi),%ecx
0xffffffff815e90ea <_raw_spin_lock+58>: cmp    %cx,%dx
0xffffffff815e90ed <_raw_spin_lock+61>: jne    0xffffffff815e90e0 <_raw_spin_lock+48>
kernel/spinlock.c: 138
0xffffffff815e90ef <_raw_spin_lock+63>: pop    %rbp
0xffffffff815e90f0 <_raw_spin_lock+64>: retq  
arch/x86/include/asm/paravirt.h: 718
0xffffffff815e90f1 <_raw_spin_lock+65>: data32 data32 xchg %ax,%ax
0xffffffff815e90f5 <_raw_spin_lock+69>: data32 xchg %ax,%ax
0xffffffff815e90f8 <_raw_spin_lock+72>: jmp    0xffffffff815e90d4 <_raw_spin_lock+36>
0xffffffff815e90fa <_raw_spin_lock+74>: nopw   0x0(%rax,%rax,1)


Tuesday, 31 March 2015

Inside Linux cgroups for blkio subsystem

cgroups enable us to distribute resources among tasks or groups of tasks. A cgroup uses subsystems (controllers for resources such as cpu, memory, blkio) to apply per-cgroup limits on these resources. Refer [1] [2].

The following steps create cgroups that apply limits only through the blkio subsystem.

create blkio cgroup :
                        mount -t tmpfs cgroup_root /sys/fs/cgroup
                        mkdir /sys/fs/cgroup/blkio
                        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
                        mkdir -p /sys/fs/cgroup/blkio/test1/                  ---------------> creation of cgroup test1
                        mkdir -p /sys/fs/cgroup/blkio/test2/                  ---------------> creation of cgroup test2

                        echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight  -----> Set weight of cgroup test1
                        echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight  ----> Set weight of cgroup test2

                        sync
                        echo 3 > /proc/sys/vm/drop_caches
                       
                        dd if=/dev/sdbv of=file_1 bs=1M count=512 &
                        echo $! > /sys/fs/cgroup/blkio/test1/tasks   ---> Attach dd process to test1 cgroup
                        cat /sys/fs/cgroup/blkio/test1/tasks

                        dd if=/dev/sdbv of=file_2 bs=1M count=512 &
                        echo $! > /sys/fs/cgroup/blkio/test2/tasks   --> Attach dd process to test2 cgroup
                        cat /sys/fs/cgroup/blkio/test2/tasks

Here we create cgroups with the blkio subsystem, assign weights and attach a "dd" process to each cgroup. The "test1" cgroup will complete its I/O faster than the "test2" cgroup because "test2" is assigned the smaller weight.

Peek into the changes made to the task_struct of the dd process:
We added a jprobe on the generic_make_request function and printed the cgroup and the subsystem each css pointer of the "dd" process is attached to.

Here is the probe function code:
void my_handler (struct bio *bio)
{
    struct task_struct *task = current;
    char *str = "dd";
    int i = 0 ;
    if (strncmp(str,task->comm,2) == 0)
    {
        printk("assignment: current process: %s, PID: %d\n", task->comm, task->pid);
        for (i=0;i<CGROUP_SUBSYS_COUNT;i++)
        {
            printk("cgroup subsys count = %d\n",i);
            if(task->cgroups->subsys != NULL)
            {
                if(task->cgroups->subsys[i] != NULL)
                {
                    if(task->cgroups->subsys[i]->cgroup != NULL)
                    {
                        if(task->cgroups->subsys[i]->cgroup->name != NULL)
                 printk("cgroup->name  = %s\n", task->cgroups->subsys[i]->cgroup->name->name);
                        if(task->cgroups->subsys[i]->ss != NULL)
                            if (task->cgroups->subsys[i]->ss->name != NULL)
                 printk("cgroup->subsys name  = %s\n", task->cgroups->subsys[i]->ss->name);
                    }
                }
            }
            else
            {
                printk("NULL\n");
            }
        }
    }
jprobe_return();
}


Following is the output we get:

2014-12-02T13:29:32.843643+05:30 lnx kernel: [508988.896860] assignment: current process: dd, PID: 29713
2014-12-02T13:29:32.843644+05:30 lnx kernel: [508988.896861] cgroup subsys count = 0
2014-12-02T13:29:32.843645+05:30 lnx kernel: [508988.896863] cgroup->name  = /
2014-12-02T13:29:32.843653+05:30 lnx kernel: [508988.896865] cgroup->subsys name  = cpuset

2014-12-02T13:29:32.843654+05:30 lnx kernel: [508988.896866] cgroup subsys count = 1
2014-12-02T13:29:32.843656+05:30 lnx kernel: [508988.896868] cgroup->name  = /
2014-12-02T13:29:32.843657+05:30 lnx kernel: [508988.896870] cgroup->subsys name  = cpu

2014-12-02T13:29:32.843658+05:30 lnx kernel: [508988.896871] cgroup subsys count = 2
2014-12-02T13:29:32.843659+05:30 lnx kernel: [508988.896873] cgroup->name  = /
2014-12-02T13:29:32.843660+05:30 lnx kernel: [508988.896874] cgroup->subsys name  = cpuacct

2014-12-02T13:29:32.843662+05:30 lnx kernel: [508988.896876] cgroup subsys count = 3
2014-12-02T13:29:32.843663+05:30 lnx kernel: [508988.896878] cgroup->name  = /
2014-12-02T13:29:32.843665+05:30 lnx kernel: [508988.896879] cgroup->subsys name  = memory

2014-12-02T13:29:32.843666+05:30 lnx kernel: [508988.896881] cgroup subsys count = 4
2014-12-02T13:29:32.843667+05:30 lnx kernel: [508988.896882] cgroup->name  = /
2014-12-02T13:29:32.843668+05:30 lnx kernel: [508988.896884] cgroup->subsys name  = devices

2014-12-02T13:29:32.843669+05:30 lnx kernel: [508988.896886] cgroup subsys count = 5
2014-12-02T13:29:32.843671+05:30 lnx kernel: [508988.896887] cgroup->name  = /
2014-12-02T13:29:32.843672+05:30 lnx kernel: [508988.896888] cgroup->subsys name  = freezer

2014-12-02T13:29:32.843681+05:30 lnx kernel: [508988.896890] cgroup subsys count = 6
2014-12-02T13:29:32.843682+05:30 lnx kernel: [508988.896891] cgroup->name  = test1
2014-12-02T13:29:32.843683+05:30 lnx kernel: [508988.896893] cgroup->subsys name  = blkio

2014-12-02T13:29:32.843685+05:30 lnx kernel: [508988.896894] cgroup subsys count = 7
2014-12-02T13:29:32.843686+05:30 lnx kernel: [508988.896896] cgroup->name  = /
2014-12-02T13:29:32.843688+05:30 lnx kernel: [508988.896897] cgroup->subsys name  = perf_event


2014-12-02T13:29:32.843689+05:30 lnx kernel: [508988.896898] cgroup subsys count = 8
2014-12-02T13:29:32.843690+05:30 lnx kernel: [508988.896901] cgroup->name  = /
2014-12-02T13:29:32.843692+05:30 lnx kernel: [508988.896902] cgroup->subsys name  = hugetlb


2014-12-02T13:29:32.843693+05:30 lnx kernel: [508988.896902] cgroup subsys count = 9
2014-12-02T13:29:32.843694+05:30 lnx kernel: [508988.896903] cgroup subsys count = 10

From this example we see that for the dd process all the subsystems (resources) use the default root ("/") cgroup, except the blkio subsystem, which uses the test1 cgroup.

Next we will see how the cgroup initialization is done and the code corresponding to the various steps used above.


Linux cgroups initialization at boot up:

A new filesystem of type "cgroup" (VFS) is registered at Linux start.
The call chain is:
start_kernel -> cgroup_init_early -> cgroup_init_subsys -> cgroup_init


In cgroup_init_subsys the top cgroup state for each subsystem is created:
                        /* Create the top cgroup state for this subsystem */
                        list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);

cgroupfs_root is created.

Filesystem registration :
The mount and unmount operations are registered in cgroup_init():
                        err = register_filesystem(&cgroup_fs_type);
                       
static struct file_system_type cgroup_fs_type = {
                        .name = "cgroup",
                        .mount = cgroup_mount,
                        .kill_sb = cgroup_kill_sb,
};


CGROUP ACTIONS :
All cgroup actions are performed via filesystem actions (creating/removing directories, reading/writing the files in them, mounting with mount options).

The mount operations were mentioned previously. The read, write, create and remove operations are handled by the following (kernel/cgroup.c):

static const struct file_operations cgroup_file_operations = {
                        .read = cgroup_file_read,
                        .write = cgroup_file_write,
                        .llseek = generic_file_llseek,
                        .open = cgroup_file_open,
                        .release = cgroup_file_release,
};

static const struct inode_operations cgroup_file_inode_operations = {
                        .setxattr = cgroup_setxattr,
                        .getxattr = cgroup_getxattr,
                        .listxattr = cgroup_listxattr,
                        .removexattr = cgroup_removexattr,
};

static const struct inode_operations cgroup_dir_inode_operations = {
                        .lookup = simple_lookup,
                        .mkdir = cgroup_mkdir,
                        .rmdir = cgroup_rmdir,
                        .rename = cgroup_rename,
                        .setxattr = cgroup_setxattr,
                        .getxattr = cgroup_getxattr,
                        .listxattr = cgroup_listxattr,
                        .removexattr = cgroup_removexattr,
};


The control group hierarchy can be mounted anywhere on the filesystem. Systemd uses /sys/fs/cgroup. When mounting, we can specify with mount options (-o) which subsystems we want to use.
For example, to make a cgroup hierarchy with the blkio subsystem the commands would be:
                        mount -t tmpfs cgroup_root /sys/fs/cgroup
                        mkdir /sys/fs/cgroup/blkio
                        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
                        mkdir -p /sys/fs/cgroup/blkio/test1/

mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
This command calls cgroup_mount, which creates the following files in the directory:

lnx:/sys/fs/cgroup/blkio # ls
blkio.io_merged                   blkio.io_service_time_recursive  blkio.reset_stats                blkio.throttle.write_bps_device   cgroup.event_control
blkio.io_merged_recursive         blkio.io_serviced                blkio.sectors                    blkio.throttle.write_iops_device  cgroup.procs
blkio.io_queued                   blkio.io_serviced_recursive      blkio.sectors_recursive          blkio.time                        cgroup.sane_behavior
blkio.io_queued_recursive         blkio.io_wait_time               blkio.throttle.io_service_bytes  blkio.time_recursive              notify_on_release
blkio.io_service_bytes            blkio.io_wait_time_recursive     blkio.throttle.io_serviced       blkio.weight                      release_agent
blkio.io_service_bytes_recursive  blkio.leaf_weight                blkio.throttle.read_bps_device   blkio.weight_device               tasks
blkio.io_service_time             blkio.leaf_weight_device         blkio.throttle.read_iops_device  cgroup.clone_children

Now make a directory in this newly created hierarchy:
mkdir -p /sys/fs/cgroup/blkio/test1

This calls cgroup_create. Here the new cgroup is created and the blkio subsystem state is initialised:
                        /* allocate the cgroup and its ID, 0 is reserved for the root */
                        cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
                        if (!cgrp)
                                                return -ENOMEM;

                        name = cgroup_alloc_name(dentry);
                       
The cgroup_subsys_state for the blkio subsystem is allocated:
                                                css = ss->css_alloc(cgroup_css(parent, ss));

For the blkio controller (blkcg) the css_alloc function called is blkcg_css_alloc.

In this function blkcg is initialised :

                        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
                        if (!blkcg)
                                                return ERR_PTR(-ENOMEM);

                        blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
                        blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
                        blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
                        spin_lock_init(&blkcg->lock);
                        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
                        INIT_HLIST_HEAD(&blkcg->blkg_list);
                       
                       
struct blkcg {
        struct cgroup_subsys_state      css;
        spinlock_t                      lock;
        struct radix_tree_root          blkg_tree;
        struct blkcg_gq                 *blkg_hint;
        struct hlist_head               blkg_list;

        /* for policies to test whether associated blkcg has changed */
        uint64_t                        id;

        /* TODO: per-policy storage in blkcg */
        unsigned int                    cfq_weight;     /* belongs to cfq */
        unsigned int                    cfq_leaf_weight;
};
                                               
init_css is called to initialise the cgroup_subsys_state with the blkio subsystem and the new cgroup:
                                                init_css(css, ss, cgrp);

2014-12-08T10:28:48.196121+05:30 lnx kernel: [243681.125109] //init_css Handler hit
2014-12-08T10:28:48.196140+05:30 lnx kernel: [243681.125115] cgrp name = test3
2014-12-08T10:28:48.196144+05:30 lnx kernel: [243681.125117] ss name = blkio

dump_stack from the jprobe on init_css:

2014-12-08T10:28:48.196175+05:30 lnx kernel: [243681.125235]  [<ffffffff810d7d69>] cgroup_mkdir+0x299/0x670
2014-12-08T10:28:48.196177+05:30 lnx kernel: [243681.125246]  [<ffffffff811a9d50>] vfs_mkdir+0xb0/0x160
2014-12-08T10:28:48.196179+05:30 lnx kernel: [243681.125254]  [<ffffffff811af28b>] SyS_mkdirat+0xab/0xe0
2014-12-08T10:28:48.196181+05:30 lnx kernel: [243681.125265]  [<ffffffff81519329>] system_call_fastpath+0x16/0x1b                              

The mkdir also generates the directory structure (the per-subsystem files) via the function calls cgroup_addrm_files and cgroup_populate_dir.

dump_stack example via jprobe :
2014-12-08T09:59:57.364352+05:30 lnx kernel: [241951.139904]  [<ffffffff810d6909>] cgroup_populate_dir+0x69/0x110
2014-12-08T09:59:57.364354+05:30 lnx kernel: [241951.139909]  [<ffffffff810d80ad>] cgroup_mkdir+0x5dd/0x670
2014-12-08T09:59:57.364356+05:30 lnx kernel: [241951.139914]  [<ffffffff811a9d50>] vfs_mkdir+0xb0/0x160
2014-12-08T09:59:57.364357+05:30 lnx kernel: [241951.139919]  [<ffffffff811af28b>] SyS_mkdirat+0xab/0xe0


Changing the cgroup policies/properties:

The cgroup properties can be changed by writing to the property files in /sys/fs/cgroup/blkio/<cgroup_name>/.

Example :
                        echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight

This calls the write function of the cgroup filesystem, cgroup_file_write:
                        if (cft->write)
                                                return cft->write(css, cft, file, buf, nbytes, ppos);

                                                This in turn calls the "weight" cftype's write function:
                        {
                                                .name = "weight",
                                                .flags = CFTYPE_NOT_ON_ROOT,
                                                .read_seq_string = cfq_print_weight,
                                                .write_u64 = cfq_set_weight,
                        },

                        And in the function __cfq_set_weight the value is stored in the blkcg (for blkio.weight):
                                                                        blkcg->cfq_weight = val;

                                                                      
Attaching a task to the cgroup:
echo <PID> > /sys/fs/cgroup/blkio/test1/tasks
                       
Writes to this file go through the "tasks" cftype:
                        {
                                                .name = "tasks",
                                                .flags = CFTYPE_INSANE,            /* use "procs" instead */
                                                .open = cgroup_tasks_open,
                                                .write_u64 = cgroup_tasks_write,
                                                .release = cgroup_pidlist_release,
                                                .mode = S_IRUGO | S_IWUSR,
                        },
                       
cgroup_tasks_write calls attach_task_by_pid (in cgroup.c), which does:
                        ret = cgroup_attach_task(cgrp, tsk, threadgroup);
                       
                       
A new css_set is created and attached to the task_struct of this process:
                                                cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);


Association of the request_queue and the block cgroup
Whenever I/O reaches the block layer, an association is created between the device's request queue and the block cgroup.

This association, "struct blkcg_gq", is created when I/O first comes to a device, in the function "blkg_create". A sample dump_stack at the creation of the association:

2014-12-15T14:02:07.036066+05:30 lnx kernel: [860978.128274] ////blkg_create Handler hit
2014-12-15T14:02:07.036078+05:30 lnx kernel: [860978.128281] CPU: 6 PID: 17627 Comm: dd Tainted: P           OENX 3.12.28-4-default #1
2014-12-15T14:02:07.036083+05:30 lnx kernel: [860978.128286]  ffff8810568495c0 ffffffff8150b1db ffffffff81acd8c0 ffffffffa039f018
2014-12-15T14:02:07.036084+05:30 lnx kernel: [860978.128291]  ffffffff8128a2f5 ffff88103e6a62c0 ffff880855f48078 ffff88103e712880
2014-12-15T14:02:07.036086+05:30 lnx kernel: [860978.128296]  ffff880855f48078 ffffffff812719b8 0000000000000001 ffff881055749808
2014-12-15T14:02:07.036092+05:30 lnx kernel: [860978.128301] Call Trace:
2014-12-15T14:02:07.036094+05:30 lnx kernel: [860978.128314]  [<ffffffff8100467d>] dump_trace+0x7d/0x2d0
2014-12-15T14:02:07.036095+05:30 lnx kernel: [860978.128321]  [<ffffffff81004964>] show_stack_log_lvl+0x94/0x170
2014-12-15T14:02:07.036096+05:30 lnx kernel: [860978.128326]  [<ffffffff81005d91>] show_stack+0x21/0x50
2014-12-15T14:02:07.036098+05:30 lnx kernel: [860978.128332]  [<ffffffff8150b1db>] dump_stack+0x41/0x51
2014-12-15T14:02:07.036099+05:30 lnx kernel: [860978.128337]  [<ffffffffa039f018>] my_handler+0x18/0x20 [probe]
2014-12-15T14:02:07.036100+05:30 lnx kernel: [860978.128347]  [<ffffffff8128a2f5>] blkg_lookup_create+0x45/0xc0
2014-12-15T14:02:07.036102+05:30 lnx kernel: [860978.128352]  [<ffffffff812719b8>] get_request+0x88/0x6f0
2014-12-15T14:02:07.036110+05:30 lnx kernel: [860978.128507]  [<ffffffff8127028f>] __blk_run_queue+0x2f/0x40
2014-12-15T14:02:07.036111+05:30 lnx kernel: [860978.128512]  [<ffffffff81273860>] blk_flush_plug_list+0x1e0/0x240
2014-12-15T14:02:07.036125+05:30 lnx kernel: [860978.128517]  [<ffffffff81273c20>] blk_finish_plug+0x10/0x40
2014-12-15T14:02:07.036127+05:30 lnx kernel: [860978.128522]  [<ffffffff81140f9f>] __do_page_cache_readahead+0x17f/0x1f0
2014-12-15T14:02:07.036128+05:30 lnx kernel: [860978.128528]  [<ffffffff8114115a>] ondemand_readahead+0x14a/0x280
2014-12-15T14:02:07.036130+05:30 lnx kernel: [860978.128534]  [<ffffffff81137129>] generic_file_aio_read+0x459/0x6f0
2014-12-15T14:02:07.036131+05:30 lnx kernel: [860978.128542]  [<ffffffff8119e2cc>] do_sync_read+0x5c/0x90
2014-12-15T14:02:07.036133+05:30 lnx kernel: [860978.128547]  [<ffffffff8119e879>] vfs_read+0x99/0x160
2014-12-15T14:02:07.036134+05:30 lnx kernel: [860978.128552]  [<ffffffff8119f378>] SyS_read+0x48/0xa0
2014-12-15T14:02:07.036136+05:30 lnx kernel: [860978.128557]  [<ffffffff81519329>] system_call_fastpath+0x16/0x1b
2014-12-15T14:02:07.036137+05:30 lnx kernel: [860978.128567]  [<00007fc484734480>] 0x7fc48473447f


In this example the blkcg_gq association is created from the get_request function. The newly created blk cgroup / request queue association is added to request_queue->blkg_list.
The association is also kept in struct blkcg->blkg_list.


References:
[1] https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt