The Linux kernel documentation describes huge pages here:
https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
Using the mount command we can see the hugetlbfs mounted :
hugetlbfs on /dev/hugepages type hugetlbfs (rw,relatime,seclabel)
The hugetlbfs filesystem is mounted by Linux at boot.
The available hugepages can be seen using :
cat /proc/meminfo
...
...
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
...
...
We can add the hugepages using the following command :
echo 32 > /proc/sys/vm/nr_hugepages
OR sysctl -w vm.nr_hugepages=xx
...
...
HugePages_Total: 32
HugePages_Free: 31
HugePages_Rsvd: 1
HugePages_Surp: 0
Hugepagesize: 2048 kB
...
...
After adding huge pages, we can use them with the following C program:
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
/*
 * Demonstrates huge-page-backed memory: creates a file in the hugetlbfs
 * mount and mmap()s it, then touches the mapping so the huge-page fault
 * path (hugetlb_fault) runs.
 *
 * Fixes over the original: hugetlbfs requires the mapping length to be a
 * multiple of the huge page size (2048 kB per /proc/meminfo above) — a
 * 1024-byte mmap fails with EINVAL; open() and mmap() results are now
 * checked before use; the mapping and fd are released on exit.
 */
#define HUGE_PAGE_SIZE (2 * 1024 * 1024) /* matches Hugepagesize: 2048 kB */

int main(void)
{
	size_t buffer_size = HUGE_PAGE_SIZE;
	size_t i;
	int fd;
	char *buffer;

	fd = open("/dev/hugepages/my_map", O_CREAT | O_RDWR, 0755);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* No cast needed: void * converts implicitly in C. */
	buffer = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE,
		      MAP_SHARED, fd, 0);
	if (buffer == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	printf("mmapped the buffer = %p\n", (void *)buffer);

	/* Writing triggers the fault handler traced later in this post. */
	for (i = 0; i < 1000; i++)
		buffer[i] = 'S';

	munmap(buffer, buffer_size);
	close(fd);
	return 0;
}
Now let's see what happens internally during these steps:
========================================================================
Adding the huge pages :
========================================================================
We can add the hugepages using the following command :
echo 32 > /proc/sys/vm/nr_hugepages
sysctl -w vm.nr_hugepages=xx
As a result of these steps lets see how the huge pages are allocated :
The hugetlb_sysctl_handler function is called when we echo to /proc/sys/vm/nr_hugepages
cd /sys/kernel/debug/tracing
cat /dev/null > trace
echo hugetlb_sysctl_handler > set_graph_function
echo 10 > max_graph_depth
echo function_graph > current_tracer
echo 1 > tracing_on
echo 32 > /proc/sys/vm/nr_hugepages
sleep 1
cp trace ~/trace_nr_hugepages_f_graph
echo 0 > tracing_on
echo > set_graph_function
echo 0 > max_graph_depth
cat /dev/null > trace
hugetlb_sysctl_handler calls hugetlb_sysctl_handler_common
which eventually calls set_max_huge_pages
set_max_huge_pages calls alloc_fresh_huge_page 32 times.
alloc_fresh_huge_page in turn calls alloc_fresh_huge_page_node, which uses alloc_pages to allocate memory from the buddy allocator
and updates internal stats using prep_new_huge_page.
2) | hugetlb_sysctl_handler() {
2) | hugetlb_sysctl_handler_common() {
...
...
2) | set_max_huge_pages() {
2) 0.056 us | _raw_spin_lock();
2) 0.044 us | _cond_resched();
2) | alloc_fresh_huge_page() {
2) | hstate_next_node_to_alloc.isra.47() {
2) 0.038 us | get_valid_node_allowed();
2) 0.187 us | next_node_allowed();
2) 1.272 us | }
2) | __alloc_pages_nodemask() {
2) 0.038 us | _cond_resched();
...
...
2) | prep_new_huge_page() {
2) 0.039 us | _raw_spin_lock();
2) | put_page() {
...
...
========================================================================
MMAP of huge page
========================================================================
Inode.c :
const struct file_operations hugetlbfs_file_operations = {
.read = hugetlbfs_read,
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
.llseek = default_llseek,
.fallocate = hugetlbfs_fallocate,
};
When mmap of a file is called in hugetlbfs hugetlbfs_file_mmap function is called.
Any mmap creates a VMA.
While the VMA is created :
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
vma->vm_ops = &hugetlb_vm_ops;
Length of the VMA is fetched
len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
}
3) | hugetlbfs_file_mmap() {
3) | mutex_lock() {
3) 0.242 us | _cond_resched();
3) 0.773 us | }
3) | touch_atime() {
3) | current_fs_time() {
3) 0.043 us | current_kernel_time();
3) 0.409 us | }
3) 0.904 us | }
3) | hugetlb_reserve_pages() {
3) | region_chg() {
3) 0.100 us | _raw_spin_lock();
3) | kmem_cache_alloc_trace() {
3) 0.035 us | _cond_resched();
3) 0.369 us | }
3) 0.034 us | _raw_spin_lock();
3) 1.643 us | }
3) 0.104 us | hugepage_subpool_get_pages();
3) | hugetlb_acct_memory() {
3) 0.035 us | _raw_spin_lock();
3) 0.650 us | }
3) | region_add() {
3) 0.035 us | _raw_spin_lock();
3) 0.373 us | }
3) 4.172 us | }
3) 0.046 us | mutex_unlock();
3) 7.342 us | }
========================================================================
Fault handling of hugepage:
========================================================================
When we write to the mmapped area the fault handler is called. If the fault is on the hugetlb page hugetlb_fault is called.
__handle_mm_fault calls :
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
hugetlb_fault uses huge_pte_alloc to allocate a page table entry. Internally this calls __pud_alloc and __pmd_alloc.
It then calls hugetlb_no_page, which calls alloc_huge_page.
alloc_huge_page may call __alloc_buddy_huge_page_with_mpol for further allocation. __hugetlb_alloc_buddy_huge_page calls alloc_pages_node
to alloc pages from normal buddy allocator.
The page allocated is returned and passed to make_huge_pte. Here pte_mkhuge sets _PAGE_PSE flag in the pte.
For more about the Page Size Extension, please see the following link:
https://en.wikipedia.org/wiki/Page_Size_Extension
cd /sys/kernel/debug/tracing
cat /dev/null > trace
echo hugetlb_fault > set_graph_function
echo 10 > max_graph_depth
echo function_graph > current_tracer
echo 1 > tracing_on
~/./a.out
sleep 1
cp trace ~/trace_hugetlb_fault_graph
echo 0 > tracing_on
echo > set_graph_function
echo 0 > max_graph_depth
cat /dev/null > trace
3) | hugetlb_fault() {
3) 0.127 us | huge_pte_offset();
3) | huge_pte_alloc() {
3) | __pud_alloc() {
3) | get_zeroed_page() {
3) | __get_free_pages() {
3) | alloc_pages_current() {
...
3) 5.428 us | }
3) 0.035 us | _raw_spin_lock();
3) 6.151 us | }
3) | huge_pmd_share() {
3) 0.052 us | find_vma();
3) | __pmd_alloc() {
3) | alloc_pages_current() {
...
...
3) 5.012 us | }
3) 5.887 us | }
3) + 14.981 us | }
3) 0.149 us | hugetlb_fault_mutex_hash();
3) | mutex_lock() {
3) 0.035 us | _cond_resched();
3) 0.483 us | }
3) | find_lock_page() {
3) | __find_lock_page() {
3) 0.080 us | __find_get_page();
3) 0.383 us | }
3) 0.675 us | }
3) | alloc_huge_page() {
3) | __vma_reservation_common() {
3) | region_chg() {
3) 0.055 us | _raw_spin_lock();
3) 0.035 us | kfree();
3) 0.738 us | }
3) 1.240 us | {
...
...
3) | region_add() {
3) 0.042 us | _raw_spin_lock();
3) 0.418 us | }
3) 0.767 us | }
3) + 11.440 us | }
3) | clear_huge_page() {
3) | huge_add_to_page_cache() {
3) | add_to_page_cache_locked() {
3) | __add_to_page_cache_locked() {
...
...
3) 0.035 us | page_waitqueue();
3) 0.045 us | __wake_up_bit();
3) 1.258 us | }
3) 0.109 us | mutex_unlock();
3) ! 1542.838 us | }
No comments:
Post a Comment