Wednesday 13 June 2018

Linux kernel crash dump analysis

In this blog we will see how to analyse a deadlock in the Linux kernel using the crash tool.

We have written a simple kernel module to generate a deadlock:

Program to create a deadlock:
#include <linux/module.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/semaphore.h>

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/slab.h>

struct semaphore sync1;
struct semaphore sync2;
struct task_struct *task1;
struct task_struct *task2;
int shared_var;
int data;

/* Thread "one": takes the semaphores in the order sync1 --> sync2
 * and releases them in the order sync2 --> sync1. */
int thread_function_one(void *data)
{
    int ret = 10;
    int cpu1;
    printk(KERN_INFO "IN THREAD FUNCTION 1 \n");

    while (!kthread_should_stop()) {
       down(&sync1);                 /* take sync1 first */
       cpu1 = get_cpu();
       put_cpu();
       printk(KERN_INFO "t1 cpu = %d shared_var = %d\n", cpu1, shared_var);
       msleep(1000);                 /* hold sync1 long enough for "two" to grab sync2 */
       down(&sync2);                 /* then wait for sync2 */
       up(&sync2);
       up(&sync1);
    }
    printk(KERN_INFO "EXIT from thread function 1\n");
    return ret;
}

/* Thread "two": takes the semaphores in the opposite order, sync2 --> sync1,
 * and releases them in the order sync1 --> sync2. */
int thread_function_two(void *data)
{
    int ret = 10;
    int cpu2;
    printk(KERN_INFO "IN THREAD FUNCTION 2 \n");

    while (!kthread_should_stop()) {
        down(&sync2);                /* take sync2 first */
        cpu2 = get_cpu();
        put_cpu();
        printk(KERN_INFO "t2 cpu = %d shared_var = %d\n", cpu2, shared_var);
        msleep(2000);                /* hold sync2 long enough for "one" to grab sync1 */
        down(&sync1);                /* then wait for sync1 */
        up(&sync1);
        up(&sync2);
    }
    printk(KERN_INFO "EXIT from thread function 2\n");
    return ret;
}

static int kernel_init(void)
{
   int cpu3;
   sema_init(&sync1, 1);
   sema_init(&sync2, 1);
   printk(KERN_INFO "module_init address of sync1 = %p sync2 = %p\n", &sync1, &sync2);

   cpu3 = get_cpu();
   put_cpu();
   printk(KERN_INFO "main thread cpu = %d \n", cpu3);

   shared_var = 0;
   /* Bind thread "one" to the CPU the module was loaded on. */
   task1 = kthread_create(&thread_function_one, (void *)&data, "one");
   kthread_bind(task1, cpu3);
   wake_up_process(task1);

   /* Bind thread "two" to CPU 3 so the two threads run on different CPUs
    * (assumes the machine has at least 4 CPUs). */
   cpu3 = 3;
   task2 = kthread_create(&thread_function_two, (void *)&data, "two");
   kthread_bind(task2, cpu3);
   wake_up_process(task2);

   return 0;
}

static void kernel_exit(void)
{
   kthread_stop(task1);
   kthread_stop(task2);
   printk(KERN_INFO "module_exit\n");
}

module_init(kernel_init);
module_exit(kernel_exit);

MODULE_AUTHOR("K_K");
MODULE_DESCRIPTION("SIMPLE MODULE");
MODULE_LICENSE("GPL");
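
The module can be built as a standard out-of-tree module. A minimal sketch, assuming the source file is named threads.c (to match the [threads] module name that appears in the logs below); note that recipe lines in a Makefile must start with a tab:

# Makefile
obj-m := threads.o

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean

It can then be built and loaded with:

make
insmod threads.ko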


Here we launch two kernel threads, and each of them tries to take both semaphores:

In thread "one" the semaphores are taken in the order  sync1 --> sync2. Also they are released like sync2 --> sync1
       down(&sync1);
       cpu1 = get_cpu();
...
...
       down(&sync2);
       up(&sync2);
       up(&sync1);

In thread "two" the semaphores are taken in the opposite order, sync2 --> sync1, and released in the order sync1 --> sync2:
        down(&sync2);
        cpu2 = get_cpu();
...
...
        down(&sync1);
        up(&sync1);
        up(&sync2);

If we load this module we see that the two threads get entangled in a deadlock: thread "one" holds sync1 and waits for sync2, while thread "two" holds sync2 and waits for sync1. This is a classic ABBA deadlock.
(The fix is to make every thread acquire the locks in the same order, as sketched below.)
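
As a minimal sketch (not part of the original module), thread_function_two could be rewritten to take the semaphores in the same sync1 --> sync2 order as thread one, which removes the inversion:

/* Sketch: same lock order as thread one (sync1, then sync2), so the
 * two threads can no longer block each other in opposite order. */
int thread_function_two_fixed(void *data)
{
    int cpu2;

    while (!kthread_should_stop()) {
        down(&sync1);                /* always take sync1 first ... */
        down(&sync2);                /* ... and sync2 second        */
        cpu2 = get_cpu();
        put_cpu();
        printk(KERN_INFO "t2 cpu = %d shared_var = %d\n", cpu2, shared_var);
        msleep(2000);
        up(&sync2);                  /* release in reverse order    */
        up(&sync1);
    }
    printk(KERN_INFO "EXIT from thread function 2\n");
    return 0;
}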

The prints in dmesg show the following:

Dec 12 02:49:25 localhost kernel: device-mapper: multipath: kundan message: error getting device 8:0
Dec 12 02:49:25 localhost kernel: threads: module verification failed: signature and/or required key missing - tainting kernel
Dec 12 02:49:25 localhost kernel: module_init address of sync1 = ffffffffa0536280 sync2 = ffffffffa0536260
Dec 12 02:49:25 localhost kernel: main thread cpu = 1
Dec 12 02:49:25 localhost kernel: IN THREAD FUNCTION 1
Dec 12 02:49:25 localhost kernel: t1 cpu = 1 shared_var = 0
Dec 12 02:49:25 localhost kernel: IN THREAD FUNCTION 2
Dec 12 02:49:25 localhost kernel: t2 cpu = 3 shared_var = 0

The ps command also shows the two kernel threads in UNINTERRUPTIBLE SLEEP (UN state):
   3580      2   1  ffff88007b1aa220  UN   0.0       0      0  [one]
   3581      2   3  ffff88007b1ac440  UN   0.0       0      0  [two]

We can get the backtraces of these threads with the following command:
echo t > /proc/sysrq-trigger

This generates a lot of output in the kernel log, because it dumps the stack of every task on the system.
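
If the full dump is too noisy, the 'w' SysRq command dumps only the tasks that are in the uninterruptible (blocked) state, which is usually exactly the set we care about in a hang:

echo w > /proc/sysrq-trigger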

For our two threads we get the following call stacks:
Dec 12 03:21:59 localhost kernel: one             D ffff88007fc94580     0  3580      2 0x00000080
Dec 12 03:21:59 localhost kernel: ffff880079573d50 0000000000000046 ffff880079573fd8 0000000000014580
Dec 12 03:21:59 localhost kernel: ffff880079573fd8 0000000000014580 ffff88007b1aa220 ffffffffa0536260
Dec 12 03:21:59 localhost kernel: 7fffffffffffffff ffff88007b1aa220 0000000000000002 0000000000000000
Dec 12 03:21:59 localhost kernel: Call Trace:
Dec 12 03:21:59 localhost kernel: [<ffffffff815f1a39>] schedule+0x29/0x70
Dec 12 03:21:59 localhost kernel: [<ffffffff815ef939>] schedule_timeout+0x209/0x2d0
Dec 12 03:21:59 localhost kernel: [<ffffffff8106e37b>] ? lock_timer_base.isra.35+0x2b/0x50
Dec 12 03:21:59 localhost kernel: [<ffffffff8106f06e>] ? try_to_del_timer_sync+0x5e/0x90
Dec 12 03:21:59 localhost kernel: [<ffffffff815f0f32>] __down_common+0xd2/0x14a
Dec 12 03:21:59 localhost kernel: [<ffffffffa0534000>] ? 0xffffffffa0533fff
Dec 12 03:21:59 localhost kernel: [<ffffffff815f0fc7>] __down+0x1d/0x1f
Dec 12 03:21:59 localhost kernel: [<ffffffff8108b8c1>] down+0x41/0x50
Dec 12 03:21:59 localhost kernel: [<ffffffffa0534073>] thread_function_one+0x73/0x90 [threads]
Dec 12 03:21:59 localhost kernel: [<ffffffff81085b0f>] kthread+0xcf/0xe0
Dec 12 03:21:59 localhost kernel: [<ffffffff81085a40>] ? kthread_create_on_node+0x140/0x140
Dec 12 03:21:59 localhost kernel: [<ffffffff815fc8ec>] ret_from_fork+0x7c/0xb0
Dec 12 03:21:59 localhost kernel: [<ffffffff81085a40>] ? kthread_create_on_node+0x140/0x140
Dec 12 03:21:59 localhost kernel: two             D ffff88007fd94580     0  3581      2 0x00000080
Dec 12 03:21:59 localhost kernel: ffff880034835d50 0000000000000046 ffff880034835fd8 0000000000014580
Dec 12 03:21:59 localhost kernel: ffff880034835fd8 0000000000014580 ffff88007b1ac440 ffffffffa0536280
Dec 12 03:21:59 localhost kernel: 7fffffffffffffff ffff88007b1ac440 0000000000000002 0000000000000000
Dec 12 03:21:59 localhost kernel: Call Trace:
Dec 12 03:21:59 localhost kernel: [<ffffffff815f1a39>] schedule+0x29/0x70
Dec 12 03:21:59 localhost kernel: [<ffffffff815ef939>] schedule_timeout+0x209/0x2d0
Dec 12 03:21:59 localhost kernel: [<ffffffff8106e37b>] ? lock_timer_base.isra.35+0x2b/0x50
Dec 12 03:21:59 localhost kernel: [<ffffffff8106f06e>] ? try_to_del_timer_sync+0x5e/0x90
Dec 12 03:21:59 localhost kernel: [<ffffffff815f0f32>] __down_common+0xd2/0x14a
Dec 12 03:21:59 localhost kernel: [<ffffffffa0534090>] ? thread_function_one+0x90/0x90 [threads]
Dec 12 03:21:59 localhost kernel: [<ffffffff815f0fc7>] __down+0x1d/0x1f
Dec 12 03:21:59 localhost kernel: [<ffffffff8108b8c1>] down+0x41/0x50
Dec 12 03:21:59 localhost kernel: [<ffffffffa0534103>] thread_function_two+0x73/0x90 [threads]
Dec 12 03:21:59 localhost kernel: [<ffffffff81085b0f>] kthread+0xcf/0xe0
Dec 12 03:21:59 localhost kernel: [<ffffffff81085a40>] ? kthread_create_on_node+0x140/0x140
Dec 12 03:21:59 localhost kernel: [<ffffffff815fc8ec>] ret_from_fork+0x7c/0xb0
Dec 12 03:21:59 localhost kernel: [<ffffffff81085a40>] ? kthread_create_on_node+0x140/0x140


We can see that both threads are sleeping in __down_common, the slow path that down() takes when the semaphore is not available.


Now we can use the Linux crash tool to look at the same state. The advantage of crash is that it lets us inspect the values of variables on the stack.

A live system can be debugged using the crash utility. The command is:
1. crash /usr/lib/debug/lib/modules/3.15.X.XX-XXX.XX.X.el7.x86_64/vmlinux
If we want to debug a vmcore generated by a kernel crash, we give the same command with the core file appended:
2. crash /usr/lib/debug/lib/modules/3.15.X.XX-XXX.XX.X.el7.x86_64/vmlinux /path/to/vmcore
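
This assumes that the crash utility and the kernel debuginfo package (which provides the vmlinux with debug symbols under /usr/lib/debug) are installed. On a RHEL/CentOS 7 style system that is typically something like:

yum install crash
debuginfo-install kernel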

The first command gives a prompt from which we can start live debugging of the kernel.

1. ps command: We can list the tasks currently known to the kernel using the crash ps command. It is similar to the shell ps command, but it also shows the task_struct address of each task.
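
crash can pipe its output through external commands, so the listing can be filtered down to our module's threads (named "one" and "two" in kthread_create()):

crash> ps | grep -e "\[one\]" -e "\[two\]"
   3580      2   1  ffff88007b1aa220  UN   0.0       0      0  [one]
   3581      2   3  ffff88007b1ac440  UN   0.0       0      0  [two]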

2. set command: We can switch the current context to any task by passing its task_struct address to the set command.

crash> set ffff88007b1aa220
    PID: 3580
COMMAND: "one"
   TASK: ffff88007b1aa220  [THREAD_INFO: ffff880079572000]
    CPU: 1
  STATE: TASK_UNINTERRUPTIBLE

crash> set ffff88007b1ac440
    PID: 3581
COMMAND: "two"
   TASK: ffff88007b1ac440  [THREAD_INFO: ffff880034834000]
    CPU: 3
  STATE: TASK_UNINTERRUPTIBLE

3. bt command: To get the stack trace of the current context we can use the "bt" (backtrace) command; it can also take a PID or task address directly, as noted after the examples below.
crash> bt
PID: 3580   TASK: ffff88007b1aa220  CPU: 1   COMMAND: "one"
 #0 [ffff880079573cf0] __schedule at ffffffff815f14fd
 #1 [ffff880079573d58] schedule at ffffffff815f1a39
 #2 [ffff880079573d68] schedule_timeout at ffffffff815ef939
 #3 [ffff880079573e10] __down_common at ffffffff815f0f32
 #4 [ffff880079573e88] __down at ffffffff815f0fc7
 #5 [ffff880079573e98] down at ffffffff8108b8c1
 #6 [ffff880079573eb8] thread_function_one at ffffffffa0534073 [threads]
 #7 [ffff880079573ec8] kthread at ffffffff81085b0f
 #8 [ffff880079573f50] ret_from_fork at ffffffff815fc8ec
crash> bt
PID: 3581   TASK: ffff88007b1ac440  CPU: 3   COMMAND: "two"
 #0 [ffff880034835cf0] __schedule at ffffffff815f14fd
 #1 [ffff880034835d58] schedule at ffffffff815f1a39
 #2 [ffff880034835d68] schedule_timeout at ffffffff815ef939
 #3 [ffff880034835e10] __down_common at ffffffff815f0f32
 #4 [ffff880034835e88] __down at ffffffff815f0fc7
 #5 [ffff880034835e98] down at ffffffff8108b8c1
 #6 [ffff880034835eb8] thread_function_two at ffffffffa0534103 [threads]
 #7 [ffff880034835ec8] kthread at ffffffff81085b0f
 #8 [ffff880034835f50] ret_from_fork at ffffffff815fc8ec
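
Note that bt also accepts a PID or a task_struct address as an argument, so the context does not have to be switched first, for example:

crash> bt 3581
crash> bt ffff88007b1ac440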

4. bt -f: We might also need the values of data on the stack in the current context. The raw stack contents of every frame can be dumped with the "bt -f" command.
Let's see its output for our hung thread:

crash> bt -f
PID: 3581   TASK: ffff88007b1ac440  CPU: 3   COMMAND: "two"
 #0 [ffff880034835cf0] __schedule at ffffffff815f14fd
    ffff880034835cf8: 0000000000000046 ffff880034835fd8
    ffff880034835d08: 0000000000014580 ffff880034835fd8
    ffff880034835d18: 0000000000014580 ffff88007b1ac440
    ffff880034835d28: ffffffffa0536280 7fffffffffffffff
    ffff880034835d38: ffff88007b1ac440 0000000000000002
    ffff880034835d48: 0000000000000000 ffff880034835d60
    ffff880034835d58: ffffffff815f1a39
 #1 [ffff880034835d58] schedule at ffffffff815f1a39
    ffff880034835d60: ffff880034835e08 ffffffff815ef939
 #2 [ffff880034835d68] schedule_timeout at ffffffff815ef939
    ffff880034835d70: 7fffffffffffffff ffff880034835da8
    ffff880034835d80: ffffffff8106e37b ffff880034835e10
    ffff880034835d90: ffff88007c514000 0000000000000286
    ffff880034835da0: 0000000000000000 ffff880034835dd8
    ffff880034835db0: ffffffff8106f06e 0000000000000286
    ffff880034835dc0: 000000001daef83c ffff880034835e10
    ffff880034835dd0: ffff880034835e10 000000001daef83c
    ffff880034835de0: ffffffffa0536280 7fffffffffffffff
    ffff880034835df0: ffff88007b1ac440 0000000000000002
    ffff880034835e00: 0000000000000000 ffff880034835e80
    ffff880034835e10: ffffffff815f0f32
 #3 [ffff880034835e10] __down_common at ffffffff815f0f32
    ffff880034835e18: 0000000000000000 ffffffffa0536288
    ffff880034835e28: ffffffffa0536288 ffff88007b1ac440
    ffff880034835e38: ffff88007b1ac400 ffffffffffffffff
    ffff880034835e48: 000000001daef83c 0000000000000000
    ffff880034835e58: ffffffffa0536280 ffffffffa0536240
    ffff880034835e68: ffffffffa0534090 0000000000000000
    ffff880034835e78: 0000000000000000 ffff880034835e90
    ffff880034835e88: ffffffff815f0fc7
 #4 [ffff880034835e88] __down at ffffffff815f0fc7
    ffff880034835e90: ffff880034835eb0 ffffffff8108b8c1
 #5 [ffff880034835e98] down at ffffffff8108b8c1
    ffff880034835ea0: 0000000000000286 ffff880079ac7cb8
    ffff880034835eb0: ffff880034835ec0 ffffffffa0534103
 #6 [ffff880034835eb8] thread_function_two at ffffffffa0534103 [threads]
    ffff880034835ec0: ffff880034835f48 ffffffff81085b0f
 #7 [ffff880034835ec8] kthread at ffffffff81085b0f
    ffff880034835ed0: 0000000000000000 0000000000000000
    ffff880034835ee0: ffffffffa0536240 0000000000000000
    ffff880034835ef0: 0000000000000000 ffff880034835ef8
    ffff880034835f00: ffff880034835ef8 ffff880000000000
    ffff880034835f10: ffff880000000000 ffff880034835f18
    ffff880034835f20: ffff880034835f18 000000001daef83c
    ffff880034835f30: ffffffff81085a40 0000000000000000
    ffff880034835f40: 0000000000000000 ffff880079ac7cb8
    ffff880034835f50: ffffffff815fc8ec
 #8 [ffff880034835f50] ret_from_fork at ffffffff815fc8ec

Here the dumped words include our local variables and the semaphore addresses passed to down(). That is a lot of addresses, so how do we pick out our variable? Because of compiler optimizations these values can land at different places in the stack.
That is why, at module init time, the addresses of the semaphores were printed:
Dec 12 02:49:25 localhost kernel: module_init address of sync1 = ffffffffa0536280 sync2 = ffffffffa0536260
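
Printing the addresses from the module is convenient but not strictly required: sync1 and sync2 are global symbols in the module, so once the module's debug data is loaded crash can resolve them by name. A sketch, assuming the module object threads.ko is at hand:

crash> mod -s threads /path/to/threads.ko
crash> sym sync1
crash> p &sync2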


5. Fetching the values of variables:
Once we are sure of a variable's address, we can dump its contents with the struct command, giving the type and the address:

crash> struct semaphore ffffffffa0536280
struct semaphore {
  lock = {
    raw_lock = {
      {
        head_tail = 262148,
        tickets = {
          head = 4,
          tail = 4
        }
      }
    }
  },
  count = 0,
  wait_list = {
    next = 0xffff880034835e20,
    prev = 0xffff880034835e20
  }
}

crash> struct semaphore ffffffffa0536260
struct semaphore {
  lock = {
    raw_lock = {
      {
        head_tail = 262148,
        tickets = {
          head = 4,
          tail = 4
        }
      }
    }
  },
  count = 0,
  wait_list = {
    next = 0xffff880079573e20,
    prev = 0xffff880079573e20
  }
}
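
Both semaphores have count = 0 and a non-empty wait_list: the waiter queued on sync1 (ffffffffa0536280) lives on thread "two"'s stack (0xffff880034835e20), and the waiter queued on sync2 (ffffffffa0536260) lives on thread "one"'s stack (0xffff880079573e20). In other words, each thread is queued on the semaphore that the other one is holding: the ABBA deadlock. To double-check which task a waiter belongs to, the wait_list pointer can be cast to the kernel's struct semaphore_waiter (its embedded list_head is its first member, so the list pointer is also the address of the waiter), for example:

crash> struct semaphore_waiter 0xffff880034835e20

The task field of that structure should point back to ffff88007b1ac440, the task_struct of thread "two".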
