Wednesday, 9 December 2015

Linux Interrupt handling for x86 systems


Let's see how interrupts are handled on an x86 system.
For this walkthrough I have used the Linux 3.15 kernel source.

The do_IRQ function is called from arch/x86/kernel/entry_64.S:
...
common_interrupt:
XCPT_FRAME
ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
...

do_IRQ implementation
/*
 * do_IRQ - C-level entry point for an interrupt on x86.
 * @regs: saved register state; regs->orig_ax holds the complemented
 *        vector number.
 *
 * Translates the vector into an IRQ number through this CPU's
 * vector_irq[] table and passes it to handle_irq(). The previous
 * irq regs pointer is saved on entry and restored before returning.
 *
 * Always returns 1.
 */
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	/* high bit used in ret_from_ code  */
	unsigned vector = ~regs->orig_ax;
	unsigned irq;

	irq_enter();
	exit_idle();

	irq = __this_cpu_read(vector_irq[vector]);

	if (!handle_irq(irq, regs)) {
		/*
		 * handle_irq() returned false, so no flow handler ran for
		 * this vector: send the End of Interrupt from here.
		 */
		ack_APIC_irq();

		if (irq != VECTOR_RETRIGGERED) {
			/* Unexpected vector: report it (rate limited). */
			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
					     __func__, smp_processor_id(),
					     vector, irq);
		} else {
			/* Retriggered vector: reset the slot to undefined. */
			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
		}
	}

	irq_exit();

	set_irq_regs(old_regs);
	return 1;
}

The do_IRQ implementation (arch/x86/kernel/irq.c) first enters interrupt context by calling irq_enter().
irq_enter() does a number of things with timers and updates jiffies.
It also increments the preempt count in __irq_enter(): preempt_count_add(HARDIRQ_OFFSET);

Then it calls exit_idle to end the idle loop.

Taking the irq number 
Then the do_IRQ function reads the IRQ number from this CPU's vector array.
unsigned vector = ~regs->orig_ax;

irq = __this_cpu_read(vector_irq[vector]);

Now things are set to execute the interrupt handler. It is done in handle_irq function.
/*
 * handle_irq - look up the descriptor for an IRQ and run its flow handler.
 * @irq:  IRQ number to handle
 * @regs: saved register state, passed to stack_overflow_check()
 *
 * Returns false if irq_to_desc() yields no descriptor for @irq, and true
 * once generic_handle_irq_desc() has been called for it.
 */
bool handle_irq(unsigned irq, struct pt_regs *regs)
{
struct irq_desc *desc;

stack_overflow_check(regs);

desc = irq_to_desc(irq);
/* No descriptor for this IRQ number: report failure to the caller. */
if (unlikely(!desc))
return false;

/* Dispatch to the flow handler stored in the descriptor. */
generic_handle_irq_desc(irq, desc);
return true;
}

This function just fetches the interrupt descriptor and calls generic_handle_irq_desc.

/*
 * generic_handle_irq_desc - invoke the flow handler of a descriptor.
 * @irq:  IRQ number, forwarded unchanged to the handler
 * @desc: descriptor whose handle_irq callback is called
 */
static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
{
desc->handle_irq(irq, desc);
}

This function calls handle_irq for this descriptor:
desc->handle_irq(irq, desc);

The handle_irq callback is set by irq_set_chip_and_handler_name() at initialization time.
Generally it points to handle_edge_irq or handle_level_irq, depending on whether the interrupt being handled is edge triggered or level triggered.

Difference between Edge triggered and Level Triggered interrupt handling 

Edge-triggered interrupts need not be masked while they are being handled, whereas level-triggered interrupts need to be masked.

handle_edge_irq
/*
 * handle_edge_irq - flow handler used for edge-triggered interrupts.
 * @irq:  IRQ number
 * @desc: interrupt descriptor for @irq
 *
 * desc->lock is taken on entry and released at out_unlock. The interrupt
 * is acknowledged through the chip's irq_ack callback before the handlers
 * run, and handle_irq_event() is repeated for as long as IRQS_PENDING is
 * set and the interrupt is not disabled.
 */
void
handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);

desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
/*
* If we're currently running this IRQ, or it's disabled,
* we shouldn't process the IRQ. Mark it pending, handle
* the necessary masking and go out
*/
if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
    irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
/* Bail out only when irq_check_poll() returns false. */
if (!irq_check_poll(desc)) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
goto out_unlock;
}
}
kstat_incr_irqs_this_cpu(irq, desc);

/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);

do {
/* The action went away: mask the interrupt and stop. */
if (unlikely(!desc->action)) {
mask_irq(desc);
goto out_unlock;
}

/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
* Re-enable it, if it was not disabled in the meantime.
*/
if (unlikely(desc->istate & IRQS_PENDING)) {
if (!irqd_irq_disabled(&desc->irq_data) &&
   irqd_irq_masked(&desc->irq_data))
unmask_irq(desc);
}

handle_irq_event(desc);

/* Go around again if IRQS_PENDING is set and the irq is still enabled. */
} while ((desc->istate & IRQS_PENDING) &&
!irqd_irq_disabled(&desc->irq_data));

out_unlock:
raw_spin_unlock(&desc->lock);
}

EXPORT_SYMBOL(handle_edge_irq);

[BOOKMARK]As edge-triggered interrupts are not masked, they can arrive on another CPU even while they are being handled on one CPU.
The first part of handle_edge_irq checks whether the interrupt is disabled, is already in progress (possibly on another CPU) or has no action. If so, it marks the interrupt as pending (IRQS_PENDING), masks it, sends an acknowledgment for it and returns.

if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
    irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
if (!irq_check_poll(desc)) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
goto out_unlock;
}
}

Next, handle_edge_irq sends an ack to the interrupt controller.
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);


Next there is a loop to handle all the interrupt requests which come for this irq.

do {
if (unlikely(!desc->action)) {
mask_irq(desc);
goto out_unlock;
}

/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
* Renable it, if it was not disabled in meantime.
*/
if (unlikely(desc->istate & IRQS_PENDING)) {
if (!irqd_irq_disabled(&desc->irq_data) &&
   irqd_irq_masked(&desc->irq_data))
unmask_irq(desc);
}

handle_irq_event(desc);

} while ((desc->istate & IRQS_PENDING) &&
!irqd_irq_disabled(&desc->irq_data));


Here it checks for another pending interrupt (which might have arrived on another CPU) and handles it. See [BOOKMARK]


handle_level_irq :
/*
 * handle_level_irq - flow handler used for level-triggered interrupts.
 * @irq:  IRQ number
 * @desc: interrupt descriptor for @irq
 *
 * With desc->lock held, the interrupt is masked and acknowledged first.
 * The handlers then run through handle_irq_event(), after which
 * cond_unmask_irq() is called. On the early-exit paths the function
 * jumps straight to out_unlock without calling cond_unmask_irq().
 */
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);

/* Already in progress: leave unless irq_check_poll() returns true. */
if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
if (!irq_check_poll(desc))
goto out_unlock;

desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);

/*
* If it's disabled or no action is available,
* keep it masked and get out of here
*/
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
goto out_unlock;
}

handle_irq_event(desc);

cond_unmask_irq(desc);

out_unlock:
raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);

As this function runs with the interrupt masked, there are fewer race conditions to deal with than in handle_edge_irq.
It simply calls mask_ack_irq(desc) up front, and calls cond_unmask_irq(desc) after the handlers have run.

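handle_irq_event: This function runs all the handlers registered for this interrupt. The loop below walks the list of actions and calls each handler (in the trace further down, the handler is called from handle_irq_event_percpu(), which handle_irq_event() invokes):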

do {
irqreturn_t res;

trace_irq_handler_entry(irq, action);
res = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, res);

if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
     irq, action->handler))
local_irq_disable();

switch (res) {
case IRQ_WAKE_THREAD:
/*
* Catch drivers which return WAKE_THREAD but
* did not set up a thread function
*/
if (unlikely(!action->thread_fn)) {
warn_no_thread(irq, action);
break;
}

__irq_wake_thread(desc, action);

/* Fall through to add to randomness */
case IRQ_HANDLED:
flags |= action->flags;
break;

default:
break;
}

retval |= res;
action = action->next;
} while (action);


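End of Interrupt: On x86 systems the APIC must be sent an EOI (End of Interrupt) to tell it that processing of the interrupt is finished. In the normal path the acknowledgment is issued by the flow handler through the interrupt chip's callback (for example desc->irq_data.chip->irq_ack() in handle_edge_irq, which shows up as apic_ack_edge() in the trace below). do_IRQ itself calls ack_APIC_irq() only when handle_irq() returns false, i.e. when no descriptor was found and therefore no flow handler ran: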

do_IRQ()
...
if (!handle_irq(irq, regs)) {
ack_APIC_irq();
...



The following ftrace function_graph trace of do_IRQ shows the sequence of events:
1)   ==========> |
 1)               |  do_IRQ() {
 1)               |    irq_enter() {
 1)               |      rcu_irq_enter() {
 1)   0.201 us    |        rcu_eqs_exit_common.isra.45();
 1)   1.223 us    |      }
 1)               |      tick_irq_enter() {
 1)   0.190 us    |        tick_check_oneshot_broadcast_this_cpu();
 1)               |        ktime_get() {
 1)   1.192 us    |          read_hpet();
 1)   2.174 us    |        }
 1)               |        update_ts_time_stats() {
 1)   0.175 us    |          nr_iowait_cpu();
 1)   1.217 us    |        }
 1)               |        ktime_get() {
 1)   1.057 us    |          read_hpet();
 1)   2.019 us    |        }
 1)   0.135 us    |        touch_softlockup_watchdog();
 1)               |        tick_do_update_jiffies64() {
 1)   0.185 us    |          _raw_spin_lock();
 1)               |          do_timer() {
 1)   0.140 us    |            calc_global_load();
 1)   1.162 us    |          }
 1)   0.135 us    |          _raw_spin_unlock();
 1)               |          update_wall_time() {
 1)   0.200 us    |            _raw_spin_lock_irqsave();
 1)   0.967 us    |            read_hpet();
 1)   0.140 us    |            ntp_tick_length();
 1)   0.136 us    |            ntp_tick_length();
 1)   0.136 us    |            ntp_tick_length();
 1)               |            timekeeping_update.constprop.8() {
 1)   0.290 us    |              update_vsyscall();
 1)               |              raw_notifier_call_chain() {
 1)   0.246 us    |                notifier_call_chain();
 1)   1.193 us    |              }
 1)   3.526 us    |            }
 1)   0.191 us    |            _raw_spin_unlock_irqrestore();
 1) + 11.778 us   |          }
 1) + 16.802 us   |        }
 1)   0.135 us    |        touch_softlockup_watchdog();
 1) + 29.011 us   |      }
 1)               |      _local_bh_enable() {
 1)   0.145 us    |        __local_bh_enable();
 1)   1.107 us    |      }
 1) + 34.723 us   |    }
 1)               |    exit_idle() {
 1)               |      atomic_notifier_call_chain() {
 1)   0.145 us    |        notifier_call_chain();
 1)   1.107 us    |      }
 1)   2.069 us    |    }
 1)               |    handle_irq() {
 1)   0.225 us    |      irq_to_desc();
 1)               |      handle_edge_irq() {
 1)   0.175 us    |        _raw_spin_lock();
 1)   0.180 us    |        irq_may_run();
 1)               |        apic_ack_edge() {
 1)   0.140 us    |          irq_complete_move();
 1)   0.140 us    |          irq_move_irq();
 1)   2.069 us    |        }
 1)               |        handle_irq_event() {
 1)   0.140 us    |          _raw_spin_unlock();
 1)               |          handle_irq_event_percpu() {
 1)               |            i8042_interrupt() {
 1)   0.231 us    |              _raw_spin_lock_irqsave();
 1)   0.155 us    |              _raw_spin_unlock_irqrestore();
 1)               |              serio_interrupt() {
 1)   0.205 us    |                _raw_spin_lock_irqsave();
 1)               |                psmouse_interrupt [psmouse]() {
 1)               |                  psmouse_handle_byte [psmouse]() {
 1)   0.315 us    |                    synaptics_process_byte [psmouse]();
 1)   1.303 us    |                  }
 1)   2.390 us    |                }
 1)   0.210 us    |                _raw_spin_unlock_irqrestore();
 1)   5.816 us    |              }
 1) + 14.528 us   |            }
 1)   0.340 us    |            add_interrupt_randomness();
 1)   0.196 us    |            note_interrupt();
 1) + 18.517 us   |          }
 1)   0.196 us    |          _raw_spin_lock();
 1) + 21.462 us   |        }
 1)   0.150 us    |        _raw_spin_unlock();
 1) + 28.525 us   |      }
 1) + 30.554 us   |    }
 1)               |    irq_exit() {
 1)   0.165 us    |      idle_cpu();
 1)               |      tick_nohz_irq_exit() {
 1)               |        __tick_nohz_idle_enter() {
 1)               |          ktime_get() {
 1)   1.298 us    |            read_hpet();
 1)   2.340 us    |          }
 1)   0.151 us    |          timekeeping_max_deferment();
 1)               |          get_next_timer_interrupt() {
 1)   0.206 us    |            _raw_spin_lock();
 1)   0.151 us    |            _raw_spin_unlock();
 1)               |            hrtimer_get_next_event() {
 1)   0.191 us    |              _raw_spin_lock_irqsave();
 1)   0.151 us    |              _raw_spin_unlock_irqrestore();
 1)   2.109 us    |            }
 1)   5.100 us    |          }
 1)               |          hrtimer_start() {
 1)               |            __hrtimer_start_range_ns() {
 1)               |              lock_hrtimer_base.isra.22() {
 1)   0.180 us    |                _raw_spin_lock_irqsave();
 1)   1.147 us    |              }
 1)   0.246 us    |              __remove_hrtimer();
 1)   0.135 us    |              get_nohz_timer_target();
 1)   0.321 us    |              enqueue_hrtimer();
 1)   0.156 us    |              _raw_spin_unlock_irqrestore();
 1)   6.373 us    |            }
 1)   7.329 us    |          }
 1) + 18.691 us   |        }
 1) + 19.658 us   |      }
 1)               |      rcu_irq_exit() {
 1)   0.225 us    |        rcu_eqs_enter_common.isra.44();
 1)   1.277 us    |      }
 1) + 23.781 us   |    }
 1) + 94.920 us   |  }
 1)   <========== |

1 comment: