Articles in this blog
Tuesday, 3 August 2021
Linux fiemap internals
Monday, 10 May 2021
Linux libaio example and ftrace
LIBAIO
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <inttypes.h>
#include <assert.h>
/*
 * Global state shared by the AIO demo below.
 *
 * uintptr_t is provided by <inttypes.h>; a local typedef is redundant and
 * conflicts on ABIs where uintptr_t is not unsigned long.
 */
int fd = 0;   /* descriptor of the file being read; assigned in main() */
int fd1 = 0;  /* not referenced by the code in this listing */
// first operation
/*
 * The file is opened with O_DIRECT, which may require the user buffer to be
 * aligned to the device's logical block size. 512 is assumed here; verify it
 * against the target device.
 */
char buff1[512] __attribute__((aligned(512)));
struct iocb iocb1 = {0};
// second operation
char buff2[512] __attribute__((aligned(512)));
struct iocb iocb2 = {0};
aio_context_t ioctx = 0;   /* AIO context handle filled in by io_setup() */
unsigned maxevents = 128;  /* event capacity requested from io_setup() */
// syscall wrappers
/*
 * io_setup - thin wrapper that issues the SYS_io_setup system call.
 *
 * Forwards maxevents and ctx unchanged and returns the syscall() result
 * narrowed to int. The caller in this file treats a negative value as failure
 * and reports it with perror().
 */
static inline int
io_setup(unsigned maxevents, aio_context_t *ctx)
{
    long rc = syscall(SYS_io_setup, maxevents, ctx);

    return rc;
}
/*
 * io_submit - thin wrapper that issues the SYS_io_submit system call.
 *
 * Passes the context, the request count nr and the array of iocb pointers
 * straight through. The caller in this file compares the return value with
 * the number of requests it submitted.
 */
static inline int
io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
    long rc = syscall(SYS_io_submit, ctx, nr, iocbpp);

    return rc;
}
/*
 * io_getevents - thin wrapper that issues the SYS_io_getevents system call.
 *
 * All five arguments are forwarded unchanged. The caller in this file uses
 * the return value as the number of entries filled in events[] and treats a
 * negative value as failure.
 */
static inline int
io_getevents(aio_context_t ctx, long min_nr, long nr,
             struct io_event *events, struct timespec *timeout)
{
    long rc = syscall(SYS_io_getevents, ctx, min_nr, nr, events, timeout);

    return rc;
}
/*
 * io_destroy - thin wrapper that issues the SYS_io_destroy system call for
 * the given AIO context and returns the syscall() result narrowed to int.
 */
static inline int
io_destroy(aio_context_t ctx)
{
    long rc = syscall(SYS_io_destroy, ctx);

    return rc;
}
/*
 * fill_iocb_1 - describe the first read in the global iocb1.
 *
 * The request is an IOCB_CMD_PREAD on local_fd that reads sizeof(buff1)
 * bytes from file offset 0 into buff1. The tag 0xbeef is stored in aio_data
 * so the completion can be recognised.
 */
void fill_iocb_1(int local_fd)
{
    struct iocb *req = &iocb1;

    /* what to do and on which descriptor */
    req->aio_lio_opcode = IOCB_CMD_PREAD;
    req->aio_fildes = local_fd;
    req->aio_reqprio = 0;

    /* where the data goes and where it comes from */
    req->aio_buf = (uintptr_t)buff1;
    req->aio_nbytes = sizeof(buff1);
    req->aio_offset = 0;

    /* cookie handed back in the completion event */
    req->aio_data = 0xbeef;
}
/*
 * fill_iocb_2 - describe the second read in the global iocb2.
 *
 * The request is an IOCB_CMD_PREAD on local_fd that reads sizeof(buff2)
 * bytes from file offset 4096 into buff2. The tag 0xbaba is stored in
 * aio_data so the completion can be recognised.
 */
void fill_iocb_2(int local_fd)
{
    struct iocb *req = &iocb2;

    /* what to do and on which descriptor */
    req->aio_lio_opcode = IOCB_CMD_PREAD;
    req->aio_fildes = local_fd;
    req->aio_reqprio = 0;

    /* where the data goes and where it comes from */
    req->aio_buf = (uintptr_t)buff2;
    req->aio_nbytes = sizeof(buff2);
    req->aio_offset = 4096;

    /* cookie handed back in the completion event */
    req->aio_data = 0xbaba;
}
int main()
{
struct iocb *iocb_ptrs[2] = { &iocb1, &iocb2 };
size_t nevents = 2;
struct io_event events[nevents];
int ret = 0;
nevents = 2;
fd = open("/root/test", O_RDWR|O_DIRECT);
if(fd < 0)
{
perror("file open");
exit(1);
}
ioctx = 0;
if (io_setup(maxevents, &ioctx) < 0) {
perror("io_setup");
exit(1);
}
else {
printf("io_setup is successsful\n");
}
fill_iocb_1(fd);
fill_iocb_2(fd);
// submit operations
ret = io_submit(ioctx, 2, iocb_ptrs);
if (ret < 0) {
perror("io_submit");
exit(1);
} else if (ret != 2) {
perror("io_submit: unhandled partial success");
exit(1);
}
ret = io_getevents(ioctx, 1 /* min */, nevents, events, NULL);
if (ret < 0) {
perror("io_getevents");
exit(1);
}
printf("Got %d events\n", ret);
for (size_t i=0; i<ret; i++) {
struct io_event *ev = &events[i];
assert(ev->data == 0xbeef || ev->data == 0xbaba);
printf("Event returned with res=%lld res2=%lld\n", ev->res, ev->res2);
nevents--;
if (ev->res < 0 ){
printf("Error \n");
}
if(i == 0)
{
printf("Data = \n");
for (int j = 0; j < 512; j++)
printf("%d\n", buff1[j]);
printf("\n");
}
}
io_destroy(ioctx);
close(fd);
return 0;
}
Let's use ftrace to trace the io_submit and io_getevents functions:
cd /sys/kernel/debug/tracing
cat /dev/null > trace
echo __x64_sys_io_submit > set_graph_function
echo 10 > max_graph_depth
echo function_graph > current_tracer
echo 1 > tracing_on
/root/libaio/./a.out
cp trace ~/io_submit_dio_trace_depth_10_2
echo 0 > tracing_on
echo > set_graph_function
echo 0 > max_graph_depth
cat /dev/null > trace
https://stackoverflow.com/questions/34235892/should-libaio-engine-to-be-used-with-unbuffered-io-directonly
gcc -D_GNU_SOURCE my_aio.c
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
11) | __x64_sys_io_submit() {
11) | lookup_ioctx() {
11) | do_page_fault() {
11) | __do_page_fault() {
11) 0.307 us | down_read_trylock();
11) | find_vma() {
11) 0.433 us | vmacache_find();
11) 0.270 us | vmacache_update();
11) 1.626 us | }
11) | handle_mm_fault() {
11) 0.319 us | mem_cgroup_from_task();
11) 0.264 us | __count_memcg_events();
11) | __handle_mm_fault() {
11) 0.330 us | pmd_devmap_trans_unstable();
11) | do_fault() {
11) | filemap_map_pages() {
11) | alloc_set_pte() {
11) 0.322 us | pmd_devmap_trans_unstable();
11) 0.332 us | _raw_spin_lock();
11) 1.027 us | page_add_file_rmap();
11) 3.002 us | }
11) 0.346 us | unlock_page();
11) | alloc_set_pte() {
11) 0.801 us | page_add_file_rmap();
11) 1.358 us | }
11) 0.282 us | unlock_page();
11) | alloc_set_pte() {
11) 0.791 us | page_add_file_rmap();
11) 1.336 us | }
11) 0.270 us | unlock_page();
11) 8.944 us | }
11) 9.706 us | }
11) + 11.120 us | }
11) + 12.799 us | }
11) 0.266 us | up_read();
11) + 17.076 us | }
11) + 17.596 us | }
11) + 19.061 us | }
11) | io_submit_one() {
11) | kmem_cache_alloc() {
11) 0.358 us | should_failslab();
11) 0.960 us | }
11) 0.334 us | __get_reqs_available();
11) | fget() {
11) 0.300 us | __fget();
11) 0.953 us | }
11) | aio_read() {
11) 0.755 us | aio_prep_rw();
11) 0.545 us | aio_setup_rw();
11) | rw_verify_area() {
11) | security_file_permission() {
11) | apparmor_file_permission() {
11) | common_file_perm() {
11) 0.285 us | aa_file_perm();
11) 0.866 us | }
11) 1.408 us | }
11) 0.292 us | __fsnotify_parent();
11) 0.319 us | fsnotify();
11) 3.108 us | }
11) 3.707 us | }
11) | generic_file_read_iter() {
11) | filemap_write_and_wait_range() {
11) 0.392 us | __filemap_fdatawrite_range();
11) | __filemap_fdatawait_range() {
11) | pagevec_lookup_range_tag() {
11) 0.760 us | find_get_pages_range_tag();
11) 1.475 us | }
11) 2.040 us | }
11) 0.313 us | filemap_check_errors();
11) 4.029 us | }
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.263 us | ktime_get_coarse_real_ts64();
11) 0.266 us | timespec64_trunc();
11) 1.393 us | }
11) 1.980 us | }
11) 2.514 us | }
11) 0.690 us | btrfs_direct_IO [btrfs]();
11) | _cond_resched() {
11) 0.335 us | rcu_all_qs();
11) 0.859 us | }
11) | pagecache_get_page() {
11) 0.574 us | find_get_entry();
11) 2.355 us | }
11) 0.340 us | mark_page_accessed();
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.268 us | ktime_get_coarse_real_ts64();
11) 0.268 us | timespec64_trunc();
11) 1.318 us | }
11) 1.876 us | }
11) 2.436 us | }
11) + 17.043 us | }
11) 0.429 us | aio_complete_rw();
11) 0.295 us | kfree();
11) + 25.152 us | }
11) 0.276 us | _raw_spin_lock_irqsave();
11) 0.274 us | _raw_spin_unlock_irqrestore();
11) | fput() {
11) 0.287 us | fput_many();
11) 0.786 us | }
11) 0.547 us | kmem_cache_free();
11) + 32.749 us | }
11) | io_submit_one() {
11) | kmem_cache_alloc() {
11) 0.266 us | should_failslab();
11) 0.799 us | }
11) 0.296 us | __get_reqs_available();
11) | fget() {
11) 0.328 us | __fget();
11) 0.823 us | }
11) | aio_read() {
11) 0.366 us | aio_prep_rw();
11) 0.294 us | aio_setup_rw();
11) | rw_verify_area() {
11) | security_file_permission() {
11) | apparmor_file_permission() {
11) | common_file_perm() {
11) 0.330 us | aa_file_perm();
11) 0.859 us | }
11) 1.359 us | }
11) 0.276 us | __fsnotify_parent();
11) 0.285 us | fsnotify();
11) 3.073 us | }
11) 3.628 us | }
11) | generic_file_read_iter() {
11) | filemap_write_and_wait_range() {
11) 0.281 us | __filemap_fdatawrite_range();
11) | __filemap_fdatawait_range() {
11) | pagevec_lookup_range_tag() {
11) 0.401 us | find_get_pages_range_tag();
11) 0.909 us | }
11) 1.440 us | }
11) 0.273 us | filemap_check_errors();
11) 3.073 us | }
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.268 us | ktime_get_coarse_real_ts64();
11) 0.268 us | timespec64_trunc();
11) 1.388 us | }
11) 1.974 us | }
11) 2.546 us | }
11) 0.311 us | btrfs_direct_IO [btrfs]();
11) 7.110 us | }
11) 0.344 us | aio_complete_rw();
11) 0.308 us | kfree();
11) + 14.100 us | }
11) 0.322 us | _raw_spin_lock_irqsave();
11) 0.304 us | refill_reqs_available();
11) 0.344 us | _raw_spin_unlock_irqrestore();
11) | fput() {
11) 0.265 us | fput_many();
11) 0.804 us | }
11) 0.293 us | kmem_cache_free();
11) + 21.363 us | }
11) + 76.921 us | }
Saturday, 8 May 2021
Linux io_uring example and internals
The basics and details of Linux io_uring are described in https://kernel.dk/io_uring.pdf. The reader is encouraged to read that document first.
Also various examples of io_uring are mentioned here:
https://github.com/shuveb/io_uring-by-example
I am using kernel 5.3.18-22 in this blog post.
Let's look at a program that reads twenty (20) 512-byte blocks from a block device.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
/* If your compilation fails because the header file below is missing,
 * your kernel is probably too old to support io_uring.
 */
#include <linux/io_uring.h>

/* Number of entries requested from io_uring_setup() */
#define QUEUE_DEPTH 1
/* Size and alignment, in bytes, of each read buffer */
#define BLOCK_SZ 512

/* This is x86 specific */
/*
 * Both barriers expand to an empty asm statement with a "memory" clobber,
 * i.e. they only stop the compiler from reordering memory accesses across
 * them; no fence instruction is emitted.
 */
#define read_barrier() __asm__ __volatile__("":::"memory")
#define write_barrier() __asm__ __volatile__("":::"memory")
/*
 * Application-side view of the submission queue (SQ) ring.
 * Every member is a pointer into the mmap'ed SQ ring region;
 * app_setup_uring() sets each one to sq_ptr plus the matching offset
 * reported in io_uring_params.sq_off.
 */
struct app_io_sq_ring {
unsigned *head;         /* saved by app_setup_uring(); not otherwise used in this listing */
unsigned *tail;         /* compared and written by submit_to_sq() */
unsigned *ring_mask;    /* ANDed with the tail in submit_to_sq() to get a slot index */
unsigned *ring_entries; /* saved by app_setup_uring(); not otherwise used in this listing */
unsigned *flags;        /* saved by app_setup_uring(); not otherwise used in this listing */
unsigned *array;        /* submit_to_sq() stores the SQE index in array[index] */
};
/*
 * Application-side view of the completion queue (CQ) ring.
 * Every member is a pointer into the mmap'ed CQ ring region;
 * app_setup_uring() sets each one to cq_ptr plus the matching offset
 * reported in io_uring_params.cq_off.
 */
struct app_io_cq_ring {
unsigned *head;         /* read by read_from_cq() as the starting position */
unsigned *tail;         /* read_from_cq() stops when its head equals *tail */
unsigned *ring_mask;    /* ANDed with the head in read_from_cq() to get a slot index */
unsigned *ring_entries; /* saved by app_setup_uring(); not otherwise used in this listing */
struct io_uring_cqe *cqes; /* completion entries indexed by read_from_cq() */
};
/*
 * Everything the demo needs to talk to one io_uring instance.
 * Populated by app_setup_uring().
 */
struct submitter {
int ring_fd;                   /* value returned by io_uring_setup() */
struct app_io_sq_ring sq_ring; /* pointers into the mapped SQ ring */
struct io_uring_sqe *sqes;     /* SQE array mapped at IORING_OFF_SQES */
struct app_io_cq_ring cq_ring; /* pointers into the mapped CQ ring */
};
/*
 * io_uring_setup - thin wrapper that issues the __NR_io_uring_setup system
 * call with the requested entry count and parameter block.
 *
 * Returns the syscall() result narrowed to int; app_setup_uring() stores it
 * as the ring file descriptor and treats a negative value as failure.
 */
int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
    long rc = syscall(__NR_io_uring_setup, entries, p);

    return (int) rc;
}
/*
 * io_uring_enter - thin wrapper that issues the __NR_io_uring_enter system
 * call.
 *
 * ring_fd, to_submit, min_complete and flags are forwarded unchanged; the
 * two trailing syscall arguments are always passed as NULL and 0.
 * Returns the syscall() result narrowed to int.
 */
int io_uring_enter(int ring_fd, unsigned int to_submit,
                   unsigned int min_complete, unsigned int flags)
{
    long rc = syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
                      flags, NULL, 0);

    return (int) rc;
}
/*
 * Per-request bookkeeping. submit_to_sq() allocates one of these together
 * with its iovec array, stores its address in the SQE's user_data, and
 * read_from_cq() recovers it from the CQE's user_data.
 */
struct io_info {
int num_io;            /* number of entries in iovecs[]; set to 20 by submit_to_sq() */
struct iovec iovecs[]; /* flexible array: one buffer descriptor per block */
};
/*
 * File size plus a trailing iovec array.
 * NOTE(review): not referenced by any code visible in this listing.
 */
struct file_info {
off_t file_sz;
struct iovec iovecs[]; /* Referred by readv/writev */
};
/*
 * submit_to_sq - queue one IORING_OP_READV request that reads the first
 * 20 blocks of BLOCK_SZ bytes from file_path, then wait for its completion.
 *
 * file_path: regular file or block device to read from.
 * s:         ring state previously initialised by app_setup_uring().
 *
 * On success the io_info (and the buffers its iovecs point at) is handed to
 * the completion side through the SQE's user_data; this function does not
 * free it. On failure before submission everything allocated here is released.
 *
 * Returns 0 on success, non-zero on failure (-1 for fstat/ioctl failures,
 * 1 otherwise).
 */
int submit_to_sq(char *file_path, struct submitter *s) {
    enum { NUM_BLOCKS = 20 };
    struct app_io_sq_ring *sring = &s->sq_ring;
    struct io_uring_sqe *sqe;
    struct io_info *ii = NULL;
    struct stat st;
    off_t file_sz = 0;
    unsigned index, tail, next_tail;
    int nr_bufs = 0;   /* buffers allocated so far, for cleanup */
    int rc = 1;

    int file_fd = open(file_path, O_RDONLY);
    if (file_fd < 0) {
        perror("open");
        return 1;
    }

    // fetch the block device size
    if (fstat(file_fd, &st) < 0) {
        perror("fstat");
        rc = -1;
        goto out_close;
    }
    if (S_ISBLK(st.st_mode)) {
        unsigned long long bytes;
        if (ioctl(file_fd, BLKGETSIZE64, &bytes) != 0) {
            perror("ioctl");
            rc = -1;
            goto out_close;
        }
        file_sz = bytes;
    } else if (S_ISREG(st.st_mode)) {
        file_sz = st.st_size;
    }
    /* off_t is signed; print it as such */
    printf("sz = %lld\n", (long long) file_sz);
    if (file_sz < 0)
        goto out_close;

    // Let's submit IO to read the first 20 blocks
    ii = calloc(1, sizeof(*ii) + sizeof(struct iovec) * NUM_BLOCKS);
    if (!ii) {
        perror("calloc");
        goto out_close;
    }
    ii->num_io = NUM_BLOCKS;
    for (int i = 0; i < NUM_BLOCKS; i++) {
        void *buf;
        /* posix_memalign() returns the error code; it does not set errno */
        int err = posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ);
        if (err) {
            fprintf(stderr, "posix_memalign: %s\n", strerror(err));
            goto out_free;
        }
        ii->iovecs[i].iov_base = buf;
        ii->iovecs[i].iov_len = BLOCK_SZ;
        nr_bufs++;
    }

    /* Add our submission queue entry to the tail of the SQE ring buffer */
    tail = *sring->tail;
    next_tail = tail + 1;
    read_barrier();
    index = tail & *sring->ring_mask;
    sqe = &s->sqes[index];
    memset(sqe, 0, sizeof(*sqe));
    sqe->fd = file_fd;
    sqe->flags = 0;
    sqe->opcode = IORING_OP_READV;
    sqe->addr = (unsigned long) ii->iovecs;
    sqe->len = NUM_BLOCKS;
    sqe->off = 0;
    sqe->user_data = (unsigned long long) ii;
    sring->array[index] = index;

    /* The SQE and its array slot must be written before the new tail. */
    write_barrier();
    /* Update the tail so the kernel can see it. */
    *sring->tail = next_tail;
    write_barrier();

    if (io_uring_enter(s->ring_fd, 1, 1, IORING_ENTER_GETEVENTS) < 0) {
        perror("io_uring_enter");
        /*
         * The request may already have been picked up, so the buffers
         * are deliberately not freed here.
         */
        goto out_close;
    }

    /* ii and its buffers are now owned by whoever reaps the completion. */
    rc = 0;
    goto out_close;

out_free:
    for (int i = 0; i < nr_bufs; i++)
        free(ii->iovecs[i].iov_base);
    free(ii);
out_close:
    close(file_fd);
    return rc;
}
/*
 * read_from_cq - drain the completion queue.
 *
 * Walks the CQ ring from the current head up to the tail. For every
 * completion it recovers the io_info stored in user_data; on error it prints
 * the error string, otherwise it prints each iovec's address and length and
 * the first two bytes of each buffer. Finally the consumed head position is
 * written back to the ring.
 */
void read_from_cq(struct submitter *s) {
    struct app_io_cq_ring *cring = &s->cq_ring;
    unsigned head = *cring->head;

    for (;;) {
        read_barrier();
        /*
         * Remember, this is a ring buffer. If head == tail, it means that the
         * buffer is empty.
         */
        if (head == *cring->tail)
            break;

        /* Get the entry */
        struct io_uring_cqe *cqe = &cring->cqes[head & *cring->ring_mask];
        struct io_info *ii = (struct io_info *)(unsigned long) cqe->user_data;
        head++;

        if (cqe->res < 0) {
            fprintf(stderr, "Error: %s\n", strerror(-cqe->res));
            continue;   /* buffers hold no valid data after a failed read */
        }

        for (int i = 0; i < ii->num_io; i++) {
            const struct iovec *iov = &ii->iovecs[i];
            const char *bytes = iov->iov_base;

            printf("iov_base = %p, iov_len = %zu\n",
                   iov->iov_base, iov->iov_len);
            printf("%d, %d\n", bytes[0], bytes[1]);
        }
    }

    /* Store through the pointer: the head lives in the shared ring. */
    *cring->head = head;
    write_barrier();
}
/*
 * app_setup_uring - create an io_uring instance and map its rings.
 *
 * Calls io_uring_setup() with QUEUE_DEPTH entries, mmaps the submission
 * ring, the completion ring and the SQE array from the returned descriptor,
 * and records pointers to the ring fields in s.
 *
 * Returns 0 on success and 1 on failure. On failure every mapping made so
 * far is unmapped, the ring descriptor is closed and s->ring_fd is set to -1.
 */
int app_setup_uring(struct submitter *s) {
    struct app_io_sq_ring *sring = &s->sq_ring;
    struct app_io_cq_ring *cring = &s->cq_ring;
    struct io_uring_params p;
    char *sq_ptr, *cq_ptr;   /* char * so offset arithmetic is standard C */
    void *map;
    size_t sring_sz, cring_sz, sqes_sz;

    memset(&p, 0, sizeof(p));
    s->ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
    if (s->ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    // Fetch and decide on submission and completion ring sizes
    sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
    sqes_sz = p.sq_entries * sizeof(struct io_uring_sqe);
    printf("sring_sz = %zu, cring_sz = %zu\n", sring_sz, cring_sz);

    // mmap the submission ring
    map = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_POPULATE,
               s->ring_fd, IORING_OFF_SQ_RING);
    if (map == MAP_FAILED) {
        perror("mmap");
        goto err_close;
    }
    sq_ptr = map;

    // mmap the completion ring
    /* Map in the completion queue ring buffer in older kernels separately */
    map = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_POPULATE,
               s->ring_fd, IORING_OFF_CQ_RING);
    if (map == MAP_FAILED) {
        perror("mmap");
        goto err_unmap_sq;
    }
    cq_ptr = map;

    /* Map in the submission queue entries array */
    map = mmap(0, sqes_sz,
               PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
               s->ring_fd, IORING_OFF_SQES);
    if (map == MAP_FAILED) {
        perror("mmap");
        goto err_unmap_cq;
    }
    s->sqes = map;

    /* Save useful fields in the app_io_sq_ring struct for later easy reference */
    sring->head = (unsigned *)(sq_ptr + p.sq_off.head);
    sring->tail = (unsigned *)(sq_ptr + p.sq_off.tail);
    sring->ring_mask = (unsigned *)(sq_ptr + p.sq_off.ring_mask);
    sring->ring_entries = (unsigned *)(sq_ptr + p.sq_off.ring_entries);
    sring->flags = (unsigned *)(sq_ptr + p.sq_off.flags);
    sring->array = (unsigned *)(sq_ptr + p.sq_off.array);

    /* Save useful fields in the app_io_cq_ring struct for later easy reference */
    cring->head = (unsigned *)(cq_ptr + p.cq_off.head);
    cring->tail = (unsigned *)(cq_ptr + p.cq_off.tail);
    cring->ring_mask = (unsigned *)(cq_ptr + p.cq_off.ring_mask);
    cring->ring_entries = (unsigned *)(cq_ptr + p.cq_off.ring_entries);
    cring->cqes = (struct io_uring_cqe *)(cq_ptr + p.cq_off.cqes);
    return 0;

err_unmap_cq:
    munmap(cq_ptr, cring_sz);
err_unmap_sq:
    munmap(sq_ptr, sring_sz);
err_close:
    close(s->ring_fd);
    s->ring_fd = -1;
    return 1;
}
/*
 * io_uring demo driver.
 *
 * Usage: prog <file-or-block-device>
 *
 * Sets up the ring, submits one readv request against argv[1] via
 * submit_to_sq(), then prints the completions via read_from_cq().
 * Returns 0 on success and 1 on any failure.
 */
int main(int argc, char *argv[]) {
    struct submitter *s;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <file-or-block-device>\n",
                argc > 0 ? argv[0] : "io_uring_demo");
        return 1;
    }

    s = malloc(sizeof(*s));
    if (!s) {
        perror("malloc");
        return 1;
    }
    memset(s, 0, sizeof(*s));

    if (app_setup_uring(s)) {
        fprintf(stderr, "Unable to setup uring!\n");
        free(s);
        return 1;
    }
    printf("setup completed\n");

    // Open the block device and read the data
    if (submit_to_sq(argv[1], s)) {
        fprintf(stderr, "Error reading file\n");
        free(s);
        return 1;
    }
    printf("submit cq completed\n");

    read_from_cq(s);
    printf("read cq completed\n");

    free(s);
    return 0;
}
Tuesday, 2 March 2021
Kprobe Example
Monday, 15 February 2021
Linux RCU Usage and internals
To develop an understanding of Linux RCU, we shall first go through Paul McKenney's explanation on YouTube.
https://www.youtube.com/watch?v=obDzjElRj9c
This helps a lot in understanding the concept behind RCU.
Next, we try to execute the RCU examples from here: https://www.kernel.org/doc/html/latest/RCU/whatisRCU.html#what-are-some-example-uses-of-core-rcu-api
Sample kernel module to test the core APIs of RCU:
#include<linux/version.h>
#include<linux/kernel.h>
#include<linux/init.h>
#include<linux/kprobes.h>
#include<linux/spinlock.h>
#include<linux/slab.h>
#include<linux/rcupdate.h>
/*
 * Payload object published through the RCU-protected pointer gbl_foo.
 * Only field a is read or written individually by the code in this module;
 * b and c are carried along when foo_update_a() copies the whole struct.
 */
struct foo {
int a;
char b;
long c;
};
/* Spinlock (despite the name) taken by foo_update_a() while it swaps gbl_foo. */
DEFINE_SPINLOCK(foo_mutex);
/* Current struct foo; read with rcu_dereference() in foo_get_a(). */
struct foo __rcu *gbl_foo;
void foo_init_a(int new_a)
{
struct foo *fp = NULL;
fp = kmalloc(sizeof(*fp), GFP_KERNEL);
fp->a = new_a;
gbl_foo = fp;
}
/*
 * foo_update_a - replace the published struct foo with a copy whose field a
 * is new_a.
 *
 * A new object is allocated, the current contents are copied into it under
 * foo_mutex, field a is overwritten and the new pointer is published with
 * rcu_assign_pointer(). After the lock is dropped the function waits in
 * synchronize_rcu() and then frees the old object.
 *
 * If the allocation fails, a message is logged and nothing is changed.
 * If no object was published yet, the new object starts out zeroed.
 */
void foo_update_a(int new_a)
{
	struct foo *new_fp;
	struct foo *old_fp;

	new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
	if (!new_fp) {
		printk("%s: allocation failed, value not updated\n", __func__);
		return;
	}

	spin_lock(&foo_mutex);
	old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
	if (old_fp)
		*new_fp = *old_fp;
	else
		*new_fp = (struct foo){ 0 };
	new_fp->a = new_a;
	rcu_assign_pointer(gbl_foo, new_fp);
	printk("updated pointer\n");
	spin_unlock(&foo_mutex);

	printk("synchronize rcu\n");
	synchronize_rcu();
	kfree(old_fp);
}
int foo_get_a(void)
{
int retval;
rcu_read_lock();
retval = rcu_dereference(gbl_foo)->a;
rcu_read_unlock();
printk("%s, %d fetched val is %d\n", __func__, __LINE__, retval);
return retval;
}
void foo_del_a(void)
{
if(gbl_foo != NULL)
kfree(gbl_foo);
}
/*
 * myinit - module entry point (registered with module_init()).
 *
 * Exercises the helpers in order: publish an object with a = 70, read it
 * twice, then update to 20 and to 30 with a read after each update, and
 * finally delete the object. Always returns 0.
 */
int myinit(void)
{
	static const int updates[] = { 20, 30 };
	unsigned int i;

	printk("module inserted\n");

	foo_init_a(70);
	foo_get_a();
	foo_get_a();

	for (i = 0; i < sizeof(updates) / sizeof(updates[0]); i++) {
		foo_update_a(updates[i]);
		foo_get_a();
	}

	foo_del_a();
	return 0;
}
/*
 * myexit - module exit hook (registered with module_exit()).
 * Only logs that the module is being removed.
 */
void myexit(void)
{
	printk("module removed\n");
	return;
}
/* Register myinit()/myexit() as the module's load and unload hooks. */
module_init(myinit);
module_exit(myexit);
/* Module metadata. */
MODULE_AUTHOR("K_K");
MODULE_DESCRIPTION("RCU MODULE");
MODULE_LICENSE("GPL");