LIBAIO
Let's look at a sample program that uses the Linux native AIO interface (the system calls that libaio wraps) and delve into its I/O path.
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <inttypes.h>
#include <assert.h>
/*
 * File-scope state shared by the helpers and main().
 *
 * uintptr_t is taken from <inttypes.h>/<stdint.h>; a hand-written
 * "typedef unsigned long uintptr_t" conflicts with the library definition on
 * targets where uintptr_t is not unsigned long, so it is not repeated here.
 */
int fd = 0;   /* descriptor of the file being read; assigned in main() */
int fd1 = 0;  /* not referenced by the code shown here; kept for compatibility */
// first operation
/*
 * The file is opened with O_DIRECT, which generally requires the user buffer
 * address to be aligned to the device's logical block size (see open(2)).
 * A 4096-byte alignment satisfies the common 512- and 4096-byte cases.
 */
char buff1[512] __attribute__((aligned(4096)));
struct iocb iocb1 = {0};
// second operation
char buff2[512] __attribute__((aligned(4096)));
struct iocb iocb2 = {0};
aio_context_t ioctx = 0;   /* AIO context handle filled in by io_setup() */
unsigned maxevents = 128;  /* value passed to io_setup() as its first argument */
// syscall wrappers
/*
 * Thin wrapper around the raw io_setup system call.
 *
 * Forwards maxevents and ctx unchanged to syscall(SYS_io_setup, ...) and
 * returns the result narrowed to int. main() treats a negative return as
 * failure and reports it with perror().
 */
static inline int
io_setup(unsigned maxevents, aio_context_t *ctx)
{
    long rc = syscall(SYS_io_setup, maxevents, ctx);

    return (int)rc;
}
/*
 * Thin wrapper around the raw io_submit system call.
 *
 * Forwards the context, the request count nr and the array of iocb pointers
 * unchanged to syscall(SYS_io_submit, ...) and returns the result narrowed to
 * int. main() compares the return value against the number of requests it
 * handed in and treats a negative value as failure.
 */
static inline int
io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
    long rc = syscall(SYS_io_submit, ctx, nr, iocbpp);

    return (int)rc;
}
/*
 * Thin wrapper around the raw io_getevents system call.
 *
 * Forwards ctx, min_nr, nr, the events array and the timeout pointer unchanged
 * to syscall(SYS_io_getevents, ...) and returns the result narrowed to int.
 * main() uses the non-negative return value as the number of entries of
 * events[] to inspect and treats a negative value as failure.
 */
static inline int
io_getevents(aio_context_t ctx, long min_nr, long nr,
             struct io_event *events, struct timespec *timeout)
{
    long rc = syscall(SYS_io_getevents, ctx, min_nr, nr, events, timeout);

    return (int)rc;
}
/*
 * Thin wrapper around the raw io_destroy system call.
 *
 * Forwards ctx unchanged to syscall(SYS_io_destroy, ...) and returns the
 * result narrowed to int.
 */
static inline int
io_destroy(aio_context_t ctx)
{
    long rc = syscall(SYS_io_destroy, ctx);

    return (int)rc;
}
/*
 * Prepare the global iocb1 as a positional read request on local_fd:
 * sizeof(buff1) bytes starting at file offset 0, delivered into buff1.
 * Only the fields assigned below are touched.
 */
void fill_iocb_1(int local_fd)
{
    struct iocb *req = &iocb1;

    /* Operation and target descriptor. */
    req->aio_lio_opcode = IOCB_CMD_PREAD;
    req->aio_fildes = local_fd;
    req->aio_reqprio = 0;

    /* Destination buffer and the file range to read (offset 0). */
    req->aio_buf = (uintptr_t)buff1;
    req->aio_nbytes = sizeof(buff1);
    req->aio_offset = 0;

    /* Tag handed back in the completion event; main() asserts on it. */
    req->aio_data = 0xbeef;
}
/*
 * Prepare the global iocb2 as a positional read request on local_fd:
 * sizeof(buff2) bytes starting at file offset 4096, delivered into buff2.
 * Only the fields assigned below are touched.
 */
void fill_iocb_2(int local_fd)
{
    struct iocb *req = &iocb2;

    /* Operation and target descriptor. */
    req->aio_lio_opcode = IOCB_CMD_PREAD;
    req->aio_fildes = local_fd;
    req->aio_reqprio = 0;

    /* Destination buffer and the file range to read (offset 4096 bytes). */
    req->aio_buf = (uintptr_t)buff2;
    req->aio_nbytes = sizeof(buff2);
    req->aio_offset = 4096;

    /* Tag handed back in the completion event; main() asserts on it. */
    req->aio_data = 0xbaba;
}
/*
 * Demo driver: opens /root/test with O_DIRECT, creates an AIO context,
 * submits the two read requests prepared by fill_iocb_1()/fill_iocb_2(),
 * waits for both completions and dumps the bytes read into buff1.
 *
 * Returns 0 when both completions were reaped (even if an individual read
 * reported an error in its event), 1 on any setup/submit/wait failure.
 */
int main(void)
{
    enum { NUM_REQUESTS = 2 };
    struct iocb *iocb_ptrs[NUM_REQUESTS] = { &iocb1, &iocb2 };
    struct io_event events[NUM_REQUESTS];
    int exit_code = 1;
    int ret;

    fd = open("/root/test", O_RDWR|O_DIRECT);
    if (fd < 0) {
        perror("file open");
        return 1;
    }

    /* The context handle is cleared before being passed to io_setup(). */
    ioctx = 0;
    if (io_setup(maxevents, &ioctx) < 0) {
        perror("io_setup");
        close(fd);
        return 1;
    }
    printf("io_setup is successsful\n");

    fill_iocb_1(fd);
    fill_iocb_2(fd);

    // submit operations
    ret = io_submit(ioctx, NUM_REQUESTS, iocb_ptrs);
    if (ret < 0) {
        perror("io_submit");
        goto out;
    }
    if (ret != NUM_REQUESTS) {
        /* A short submit is not an errno condition, so perror() would print
         * an unrelated, stale error string here. */
        fprintf(stderr, "io_submit: unhandled partial success (%d of %d)\n",
                ret, NUM_REQUESTS);
        goto out;
    }

    /* Wait for every submitted request: asking for only one completion could
     * return before the read into buff1 has finished. */
    ret = io_getevents(ioctx, NUM_REQUESTS, NUM_REQUESTS, events, NULL);
    if (ret < 0) {
        perror("io_getevents");
        goto out;
    }
    printf("Got %d events\n", ret);

    for (int i = 0; i < ret; i++) {
        struct io_event *ev = &events[i];

        assert(ev->data == 0xbeef || ev->data == 0xbaba);
        printf("Event returned with res=%lld res2=%lld\n",
               (long long)ev->res, (long long)ev->res2);
        if (ev->res < 0) {
            /* The buffer contents are meaningless for a failed request. */
            printf("Error \n");
            continue;
        }
        /* Completions may arrive in any order, so pick the buff1 request by
         * its tag rather than by its position in events[]. */
        if (ev->data == 0xbeef) {
            printf("Data = \n");
            /* Print only the bytes that were actually read. */
            for (long long j = 0;
                 j < (long long)ev->res && j < (long long)sizeof(buff1); j++)
                printf("%d\n", buff1[j]);
            printf("\n");
        }
    }
    exit_code = 0;

out:
    io_destroy(ioctx);
    close(fd);
    return exit_code;
}
Let's ftrace the io_submit and io_getevents functions:
# Capture a function_graph trace of the io_submit system call made by the
# sample program. All control files below live in the tracing directory.
cd /sys/kernel/debug/tracing
# Start from an empty trace buffer.
cat /dev/null > trace
# Restrict graph tracing to the io_submit syscall entry point (x86-64 symbol name).
echo __x64_sys_io_submit > set_graph_function
# Set the graph depth limit to 10.
echo 10 > max_graph_depth
echo function_graph > current_tracer
echo 1 > tracing_on
# Run the compiled sample while tracing is enabled.
/root/libaio/./a.out
# Save the captured trace.
# NOTE(review): the copy is taken while tracing is still on; consider
# writing 0 to tracing_on first so the snapshot cannot change underneath cp.
cp trace ~/io_submit_dio_trace_depth_10_2
echo 0 > tracing_on
# Undo the filter and depth settings and clear the buffer again.
# NOTE(review): presumably 0 means "no depth limit" for max_graph_depth; confirm
# against the ftrace documentation.
echo > set_graph_function
echo 0 > max_graph_depth
cat /dev/null > trace
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <inttypes.h>
#include <assert.h>
/*
 * File-scope state shared by the helpers and main().
 *
 * uintptr_t is taken from <inttypes.h>/<stdint.h>; a hand-written
 * "typedef unsigned long uintptr_t" conflicts with the library definition on
 * targets where uintptr_t is not unsigned long, so it is not repeated here.
 */
int fd = 0;   /* descriptor of the file being read; assigned in main() */
int fd1 = 0;  /* not referenced by the code shown here; kept for compatibility */
// first operation
/*
 * The file is opened with O_DIRECT, which generally requires the user buffer
 * address to be aligned to the device's logical block size (see open(2)).
 * A 4096-byte alignment satisfies the common 512- and 4096-byte cases.
 */
char buff1[512] __attribute__((aligned(4096)));
struct iocb iocb1 = {0};
// second operation
char buff2[512] __attribute__((aligned(4096)));
struct iocb iocb2 = {0};
aio_context_t ioctx = 0;   /* AIO context handle filled in by io_setup() */
unsigned maxevents = 128;  /* value passed to io_setup() as its first argument */
// syscall wrappers
/*
 * Thin wrapper around the raw io_setup system call.
 *
 * Forwards maxevents and ctx unchanged to syscall(SYS_io_setup, ...) and
 * returns the result narrowed to int. main() treats a negative return as
 * failure and reports it with perror().
 */
static inline int
io_setup(unsigned maxevents, aio_context_t *ctx)
{
    long rc = syscall(SYS_io_setup, maxevents, ctx);

    return (int)rc;
}
/*
 * Thin wrapper around the raw io_submit system call.
 *
 * Forwards the context, the request count nr and the array of iocb pointers
 * unchanged to syscall(SYS_io_submit, ...) and returns the result narrowed to
 * int. main() compares the return value against the number of requests it
 * handed in and treats a negative value as failure.
 */
static inline int
io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
    long rc = syscall(SYS_io_submit, ctx, nr, iocbpp);

    return (int)rc;
}
/*
 * Thin wrapper around the raw io_getevents system call.
 *
 * Forwards ctx, min_nr, nr, the events array and the timeout pointer unchanged
 * to syscall(SYS_io_getevents, ...) and returns the result narrowed to int.
 * main() uses the non-negative return value as the number of entries of
 * events[] to inspect and treats a negative value as failure.
 */
static inline int
io_getevents(aio_context_t ctx, long min_nr, long nr,
             struct io_event *events, struct timespec *timeout)
{
    long rc = syscall(SYS_io_getevents, ctx, min_nr, nr, events, timeout);

    return (int)rc;
}
/*
 * Thin wrapper around the raw io_destroy system call.
 *
 * Forwards ctx unchanged to syscall(SYS_io_destroy, ...) and returns the
 * result narrowed to int.
 */
static inline int
io_destroy(aio_context_t ctx)
{
    long rc = syscall(SYS_io_destroy, ctx);

    return (int)rc;
}
/*
 * Prepare the global iocb1 as a positional read request on local_fd:
 * sizeof(buff1) bytes starting at file offset 0, delivered into buff1.
 * Only the fields assigned below are touched.
 */
void fill_iocb_1(int local_fd)
{
    struct iocb *req = &iocb1;

    /* Operation and target descriptor. */
    req->aio_lio_opcode = IOCB_CMD_PREAD;
    req->aio_fildes = local_fd;
    req->aio_reqprio = 0;

    /* Destination buffer and the file range to read (offset 0). */
    req->aio_buf = (uintptr_t)buff1;
    req->aio_nbytes = sizeof(buff1);
    req->aio_offset = 0;

    /* Tag handed back in the completion event; main() asserts on it. */
    req->aio_data = 0xbeef;
}
/*
 * Prepare the global iocb2 as a positional read request on local_fd:
 * sizeof(buff2) bytes starting at file offset 4096, delivered into buff2.
 * Only the fields assigned below are touched.
 */
void fill_iocb_2(int local_fd)
{
    struct iocb *req = &iocb2;

    /* Operation and target descriptor. */
    req->aio_lio_opcode = IOCB_CMD_PREAD;
    req->aio_fildes = local_fd;
    req->aio_reqprio = 0;

    /* Destination buffer and the file range to read (offset 4096 bytes). */
    req->aio_buf = (uintptr_t)buff2;
    req->aio_nbytes = sizeof(buff2);
    req->aio_offset = 4096;

    /* Tag handed back in the completion event; main() asserts on it. */
    req->aio_data = 0xbaba;
}
/*
 * Demo driver: opens /root/test with O_DIRECT, creates an AIO context,
 * submits the two read requests prepared by fill_iocb_1()/fill_iocb_2(),
 * waits for both completions and dumps the bytes read into buff1.
 *
 * Returns 0 when both completions were reaped (even if an individual read
 * reported an error in its event), 1 on any setup/submit/wait failure.
 */
int main(void)
{
    enum { NUM_REQUESTS = 2 };
    struct iocb *iocb_ptrs[NUM_REQUESTS] = { &iocb1, &iocb2 };
    struct io_event events[NUM_REQUESTS];
    int exit_code = 1;
    int ret;

    fd = open("/root/test", O_RDWR|O_DIRECT);
    if (fd < 0) {
        perror("file open");
        return 1;
    }

    /* The context handle is cleared before being passed to io_setup(). */
    ioctx = 0;
    if (io_setup(maxevents, &ioctx) < 0) {
        perror("io_setup");
        close(fd);
        return 1;
    }
    printf("io_setup is successsful\n");

    fill_iocb_1(fd);
    fill_iocb_2(fd);

    // submit operations
    ret = io_submit(ioctx, NUM_REQUESTS, iocb_ptrs);
    if (ret < 0) {
        perror("io_submit");
        goto out;
    }
    if (ret != NUM_REQUESTS) {
        /* A short submit is not an errno condition, so perror() would print
         * an unrelated, stale error string here. */
        fprintf(stderr, "io_submit: unhandled partial success (%d of %d)\n",
                ret, NUM_REQUESTS);
        goto out;
    }

    /* Wait for every submitted request: asking for only one completion could
     * return before the read into buff1 has finished. */
    ret = io_getevents(ioctx, NUM_REQUESTS, NUM_REQUESTS, events, NULL);
    if (ret < 0) {
        perror("io_getevents");
        goto out;
    }
    printf("Got %d events\n", ret);

    for (int i = 0; i < ret; i++) {
        struct io_event *ev = &events[i];

        assert(ev->data == 0xbeef || ev->data == 0xbaba);
        printf("Event returned with res=%lld res2=%lld\n",
               (long long)ev->res, (long long)ev->res2);
        if (ev->res < 0) {
            /* The buffer contents are meaningless for a failed request. */
            printf("Error \n");
            continue;
        }
        /* Completions may arrive in any order, so pick the buff1 request by
         * its tag rather than by its position in events[]. */
        if (ev->data == 0xbeef) {
            printf("Data = \n");
            /* Print only the bytes that were actually read. */
            for (long long j = 0;
                 j < (long long)ev->res && j < (long long)sizeof(buff1); j++)
                printf("%d\n", buff1[j]);
            printf("\n");
        }
    }
    exit_code = 0;

out:
    io_destroy(ioctx);
    close(fd);
    return exit_code;
}
Let's ftrace the io_submit and io_getevents functions:
# Capture a function_graph trace of the io_submit system call made by the
# sample program. All control files below live in the tracing directory.
cd /sys/kernel/debug/tracing
# Start from an empty trace buffer.
cat /dev/null > trace
# Restrict graph tracing to the io_submit syscall entry point (x86-64 symbol name).
echo __x64_sys_io_submit > set_graph_function
# Set the graph depth limit to 10.
echo 10 > max_graph_depth
echo function_graph > current_tracer
echo 1 > tracing_on
# Run the compiled sample while tracing is enabled.
/root/libaio/./a.out
# Save the captured trace.
# NOTE(review): the copy is taken while tracing is still on; consider
# writing 0 to tracing_on first so the snapshot cannot change underneath cp.
cp trace ~/io_submit_dio_trace_depth_10_2
echo 0 > tracing_on
# Undo the filter and depth settings and clear the buffer again.
# NOTE(review): presumably 0 means "no depth limit" for max_graph_depth; confirm
# against the ftrace documentation.
echo > set_graph_function
echo 0 > max_graph_depth
cat /dev/null > trace
libaio and O_DIRECT
https://stackoverflow.com/questions/34235892/should-libaio-engine-to-be-used-with-unbuffered-io-directonly
https://stackoverflow.com/questions/34235892/should-libaio-engine-to-be-used-with-unbuffered-io-directonly
Compilation :
gcc -D_GNU_SOURCE my_aio.c
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
11) | __x64_sys_io_submit() {
11) | lookup_ioctx() {
11) | do_page_fault() {
11) | __do_page_fault() {
11) 0.307 us | down_read_trylock();
11) | find_vma() {
11) 0.433 us | vmacache_find();
11) 0.270 us | vmacache_update();
11) 1.626 us | }
11) | handle_mm_fault() {
11) 0.319 us | mem_cgroup_from_task();
11) 0.264 us | __count_memcg_events();
11) | __handle_mm_fault() {
11) 0.330 us | pmd_devmap_trans_unstable();
11) | do_fault() {
11) | filemap_map_pages() {
11) | alloc_set_pte() {
11) 0.322 us | pmd_devmap_trans_unstable();
11) 0.332 us | _raw_spin_lock();
11) 1.027 us | page_add_file_rmap();
11) 3.002 us | }
11) 0.346 us | unlock_page();
11) | alloc_set_pte() {
11) 0.801 us | page_add_file_rmap();
11) 1.358 us | }
11) 0.282 us | unlock_page();
11) | alloc_set_pte() {
11) 0.791 us | page_add_file_rmap();
11) 1.336 us | }
11) 0.270 us | unlock_page();
11) 8.944 us | }
11) 9.706 us | }
11) + 11.120 us | }
11) + 12.799 us | }
11) 0.266 us | up_read();
11) + 17.076 us | }
11) + 17.596 us | }
11) + 19.061 us | }
11) | io_submit_one() {
11) | kmem_cache_alloc() {
11) 0.358 us | should_failslab();
11) 0.960 us | }
11) 0.334 us | __get_reqs_available();
11) | fget() {
11) 0.300 us | __fget();
11) 0.953 us | }
11) | aio_read() {
11) 0.755 us | aio_prep_rw();
11) 0.545 us | aio_setup_rw();
11) | rw_verify_area() {
11) | security_file_permission() {
11) | apparmor_file_permission() {
11) | common_file_perm() {
11) 0.285 us | aa_file_perm();
11) 0.866 us | }
11) 1.408 us | }
11) 0.292 us | __fsnotify_parent();
11) 0.319 us | fsnotify();
11) 3.108 us | }
11) 3.707 us | }
11) | generic_file_read_iter() {
11) | filemap_write_and_wait_range() {
11) 0.392 us | __filemap_fdatawrite_range();
11) | __filemap_fdatawait_range() {
11) | pagevec_lookup_range_tag() {
11) 0.760 us | find_get_pages_range_tag();
11) 1.475 us | }
11) 2.040 us | }
11) 0.313 us | filemap_check_errors();
11) 4.029 us | }
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.263 us | ktime_get_coarse_real_ts64();
11) 0.266 us | timespec64_trunc();
11) 1.393 us | }
11) 1.980 us | }
11) 2.514 us | }
11) 0.690 us | btrfs_direct_IO [btrfs]();
11) | _cond_resched() {
11) 0.335 us | rcu_all_qs();
11) 0.859 us | }
11) | pagecache_get_page() {
11) 0.574 us | find_get_entry();
11) 2.355 us | }
11) 0.340 us | mark_page_accessed();
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.268 us | ktime_get_coarse_real_ts64();
11) 0.268 us | timespec64_trunc();
11) 1.318 us | }
11) 1.876 us | }
11) 2.436 us | }
11) + 17.043 us | }
11) 0.429 us | aio_complete_rw();
11) 0.295 us | kfree();
11) + 25.152 us | }
11) 0.276 us | _raw_spin_lock_irqsave();
11) 0.274 us | _raw_spin_unlock_irqrestore();
11) | fput() {
11) 0.287 us | fput_many();
11) 0.786 us | }
11) 0.547 us | kmem_cache_free();
11) + 32.749 us | }
11) | io_submit_one() {
11) | kmem_cache_alloc() {
11) 0.266 us | should_failslab();
11) 0.799 us | }
11) 0.296 us | __get_reqs_available();
11) | fget() {
11) 0.328 us | __fget();
11) 0.823 us | }
11) | aio_read() {
11) 0.366 us | aio_prep_rw();
11) 0.294 us | aio_setup_rw();
11) | rw_verify_area() {
11) | security_file_permission() {
11) | apparmor_file_permission() {
11) | common_file_perm() {
11) 0.330 us | aa_file_perm();
11) 0.859 us | }
11) 1.359 us | }
11) 0.276 us | __fsnotify_parent();
11) 0.285 us | fsnotify();
11) 3.073 us | }
11) 3.628 us | }
11) | generic_file_read_iter() {
11) | filemap_write_and_wait_range() {
11) 0.281 us | __filemap_fdatawrite_range();
11) | __filemap_fdatawait_range() {
11) | pagevec_lookup_range_tag() {
11) 0.401 us | find_get_pages_range_tag();
11) 0.909 us | }
11) 1.440 us | }
11) 0.273 us | filemap_check_errors();
11) 3.073 us | }
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.268 us | ktime_get_coarse_real_ts64();
11) 0.268 us | timespec64_trunc();
11) 1.388 us | }
11) 1.974 us | }
11) 2.546 us | }
11) 0.311 us | btrfs_direct_IO [btrfs]();
11) 7.110 us | }
11) 0.344 us | aio_complete_rw();
11) 0.308 us | kfree();
11) + 14.100 us | }
11) 0.322 us | _raw_spin_lock_irqsave();
11) 0.304 us | refill_reqs_available();
11) 0.344 us | _raw_spin_unlock_irqrestore();
11) | fput() {
11) 0.265 us | fput_many();
11) 0.804 us | }
11) 0.293 us | kmem_cache_free();
11) + 21.363 us | }
11) + 76.921 us | }
gcc -D_GNU_SOURCE my_aio.c
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
11) | __x64_sys_io_submit() {
11) | lookup_ioctx() {
11) | do_page_fault() {
11) | __do_page_fault() {
11) 0.307 us | down_read_trylock();
11) | find_vma() {
11) 0.433 us | vmacache_find();
11) 0.270 us | vmacache_update();
11) 1.626 us | }
11) | handle_mm_fault() {
11) 0.319 us | mem_cgroup_from_task();
11) 0.264 us | __count_memcg_events();
11) | __handle_mm_fault() {
11) 0.330 us | pmd_devmap_trans_unstable();
11) | do_fault() {
11) | filemap_map_pages() {
11) | alloc_set_pte() {
11) 0.322 us | pmd_devmap_trans_unstable();
11) 0.332 us | _raw_spin_lock();
11) 1.027 us | page_add_file_rmap();
11) 3.002 us | }
11) 0.346 us | unlock_page();
11) | alloc_set_pte() {
11) 0.801 us | page_add_file_rmap();
11) 1.358 us | }
11) 0.282 us | unlock_page();
11) | alloc_set_pte() {
11) 0.791 us | page_add_file_rmap();
11) 1.336 us | }
11) 0.270 us | unlock_page();
11) 8.944 us | }
11) 9.706 us | }
11) + 11.120 us | }
11) + 12.799 us | }
11) 0.266 us | up_read();
11) + 17.076 us | }
11) + 17.596 us | }
11) + 19.061 us | }
11) | io_submit_one() {
11) | kmem_cache_alloc() {
11) 0.358 us | should_failslab();
11) 0.960 us | }
11) 0.334 us | __get_reqs_available();
11) | fget() {
11) 0.300 us | __fget();
11) 0.953 us | }
11) | aio_read() {
11) 0.755 us | aio_prep_rw();
11) 0.545 us | aio_setup_rw();
11) | rw_verify_area() {
11) | security_file_permission() {
11) | apparmor_file_permission() {
11) | common_file_perm() {
11) 0.285 us | aa_file_perm();
11) 0.866 us | }
11) 1.408 us | }
11) 0.292 us | __fsnotify_parent();
11) 0.319 us | fsnotify();
11) 3.108 us | }
11) 3.707 us | }
11) | generic_file_read_iter() {
11) | filemap_write_and_wait_range() {
11) 0.392 us | __filemap_fdatawrite_range();
11) | __filemap_fdatawait_range() {
11) | pagevec_lookup_range_tag() {
11) 0.760 us | find_get_pages_range_tag();
11) 1.475 us | }
11) 2.040 us | }
11) 0.313 us | filemap_check_errors();
11) 4.029 us | }
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.263 us | ktime_get_coarse_real_ts64();
11) 0.266 us | timespec64_trunc();
11) 1.393 us | }
11) 1.980 us | }
11) 2.514 us | }
11) 0.690 us | btrfs_direct_IO [btrfs]();
11) | _cond_resched() {
11) 0.335 us | rcu_all_qs();
11) 0.859 us | }
11) | pagecache_get_page() {
11) 0.574 us | find_get_entry();
11) 2.355 us | }
11) 0.340 us | mark_page_accessed();
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.268 us | ktime_get_coarse_real_ts64();
11) 0.268 us | timespec64_trunc();
11) 1.318 us | }
11) 1.876 us | }
11) 2.436 us | }
11) + 17.043 us | }
11) 0.429 us | aio_complete_rw();
11) 0.295 us | kfree();
11) + 25.152 us | }
11) 0.276 us | _raw_spin_lock_irqsave();
11) 0.274 us | _raw_spin_unlock_irqrestore();
11) | fput() {
11) 0.287 us | fput_many();
11) 0.786 us | }
11) 0.547 us | kmem_cache_free();
11) + 32.749 us | }
11) | io_submit_one() {
11) | kmem_cache_alloc() {
11) 0.266 us | should_failslab();
11) 0.799 us | }
11) 0.296 us | __get_reqs_available();
11) | fget() {
11) 0.328 us | __fget();
11) 0.823 us | }
11) | aio_read() {
11) 0.366 us | aio_prep_rw();
11) 0.294 us | aio_setup_rw();
11) | rw_verify_area() {
11) | security_file_permission() {
11) | apparmor_file_permission() {
11) | common_file_perm() {
11) 0.330 us | aa_file_perm();
11) 0.859 us | }
11) 1.359 us | }
11) 0.276 us | __fsnotify_parent();
11) 0.285 us | fsnotify();
11) 3.073 us | }
11) 3.628 us | }
11) | generic_file_read_iter() {
11) | filemap_write_and_wait_range() {
11) 0.281 us | __filemap_fdatawrite_range();
11) | __filemap_fdatawait_range() {
11) | pagevec_lookup_range_tag() {
11) 0.401 us | find_get_pages_range_tag();
11) 0.909 us | }
11) 1.440 us | }
11) 0.273 us | filemap_check_errors();
11) 3.073 us | }
11) | touch_atime() {
11) | atime_needs_update() {
11) | current_time() {
11) 0.268 us | ktime_get_coarse_real_ts64();
11) 0.268 us | timespec64_trunc();
11) 1.388 us | }
11) 1.974 us | }
11) 2.546 us | }
11) 0.311 us | btrfs_direct_IO [btrfs]();
11) 7.110 us | }
11) 0.344 us | aio_complete_rw();
11) 0.308 us | kfree();
11) + 14.100 us | }
11) 0.322 us | _raw_spin_lock_irqsave();
11) 0.304 us | refill_reqs_available();
11) 0.344 us | _raw_spin_unlock_irqrestore();
11) | fput() {
11) 0.265 us | fput_many();
11) 0.804 us | }
11) 0.293 us | kmem_cache_free();
11) + 21.363 us | }
11) + 76.921 us | }
No comments:
Post a Comment