A detailed look at linux arm64 head.S
=====================================

First, here are the main code fragments in head.S::

    __INIT

    /*
     * The following callee saved general purpose registers are used on the
     * primary lowlevel boot path:
     *
     *  Register   Scope                        Purpose
     *  x21        stext() .. start_kernel()    FDT pointer passed at boot in x0
     *  x23        stext() .. start_kernel()    physical misalignment/KASLR offset
     *  x28        __create_page_tables()       callee preserved temp register
     *  x19/x20    __primary_switch()           callee preserved temp registers
     *  x24        __primary_switch() .. relocate_kernel()
     *                                          current RELR displacement
     */
    ENTRY(stext)
        bl      preserve_boot_args
        bl      el2_setup                       // Drop to EL1, w0=cpu_boot_mode
        adrp    x23, __PHYS_OFFSET
        and     x23, x23, MIN_KIMG_ALIGN - 1    // KASLR offset, defaults to 0
        bl      set_cpu_boot_mode_flag
        bl      __create_page_tables
        /*
         * The following calls CPU setup code, see arch/arm64/mm/proc.S for
         * details.
         * On return, the CPU will be ready for the MMU to be turned on and
         * the TCR will have been set.
         */
        bl      __cpu_setup                     // initialise processor
        b       __primary_switch
    ENDPROC(stext)

These are the main functions in head.S. They run in sequence, and __primary_switch
finally jumps to start_kernel, where execution in C code begins.

preserve_boot_args
^^^^^^^^^^^^^^^^^^

::

    /*
     * Preserve the arguments passed by the bootloader in x0 .. x3
     */
    preserve_boot_args:
        mov     x21, x0                         // x21=FDT

        adr_l   x0, boot_args                   // record the contents of
        stp     x21, x1, [x0]                   // x0 .. x3 at kernel entry
        stp     x2, x3, [x0, #16]

        dmb     sy                              // needed before dc ivac with
                                                // MMU off

        mov     x1, #0x20                       // 4 x 8 bytes
        b       __inval_dcache_area             // tail call
    ENDPROC(preserve_boot_args)

The bootloader passes the physical address of the device tree (FDT) in x0, so it is
saved into x21 for later use, which also frees up x0. The values of x0 .. x3 are then
stored into the memory at the boot_args label, a ``dmb sy`` data memory barrier is
issued (required before the ``dc ivac`` that follows, since the MMU is still off), and
__inval_dcache_area is tail-called to invalidate any cache lines covering that memory::

    /*
     * The recorded values of x0 .. x3 upon kernel entry.
     */
    u64 __cacheline_aligned boot_args[4];

A note on the adr_l macro: it loads the runtime address of the boot_args label into
x0. adr_l ultimately expands to an adrp instruction, which computes the address
PC-relatively and thereby turns the link-time symbol address into a runtime address.
The symbol addresses assigned in vmlinux.lds.S are all virtual addresses, but at this
point the MMU has not been enabled yet, so the runtime address is simply the physical
address.

el2_setup
^^^^^^^^^

ARMv8 introduces the concept of exception levels, EL0 .. EL3, four levels in total,
which replaces the older notion of normal versus privileged processor modes.
User-space applications run at the least privileged level, EL0; the OS kernel runs at
EL1; EL2 is used for virtualization; and the secure monitor, which provides Security
support, sits at EL3.

The SPSel (stack pointer select) register is officially described as "Allows the
Stack Pointer to be selected between SP_EL0 and SP_ELx". The CurrentEL register
reports the exception level the CPU is currently executing at.

::

    adrp    x23, __PHYS_OFFSET
    and     x23, x23, MIN_KIMG_ALIGN - 1        // KASLR offset, defaults to 0

    #define __PHYS_OFFSET   (KERNEL_START - TEXT_OFFSET)
    #define KERNEL_START    _text
    #define KERNEL_END      _end

    /*
     * arm64 requires the kernel image to be placed
     * TEXT_OFFSET bytes beyond a 2 MB aligned base
     */
    #define MIN_KIMG_ALIGN  SZ_2M

These two lines compute the kernel image's physical misalignment relative to its 2 MB
aligned base (the KASLR offset) and keep it in x23; without KASLR it is 0. Note that
__PHYS_OFFSET is defined in terms of virtual-address symbols, but adrp yields the
runtime (here, physical) address.

set_cpu_boot_mode_flag
^^^^^^^^^^^^^^^^^^^^^^

::

    /*
     * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
     * in w0. See arch/arm64/include/asm/virt.h for more info.
     */
    set_cpu_boot_mode_flag:
        adr_l   x1, __boot_cpu_mode
        cmp     w0, #BOOT_CPU_MODE_EL2
        b.ne    1f
        add     x1, x1, #4
    1:  str     w0, [x1]                        // This CPU has booted in EL1
        dmb     sy
        dc      ivac, x1                        // Invalidate potentially stale cache line
        ret
    ENDPROC(set_cpu_boot_mode_flag)

    /*
     * We need to find out the CPU boot mode long after boot, so we need to
     * store it in a writable variable.
     *
     * This is not in .bss, because we set it sufficiently early that the boot-time
     * zeroing of .bss would clobber it.
     */
    ENTRY(__boot_cpu_mode)
        .long   BOOT_CPU_MODE_EL2
        .long   BOOT_CPU_MODE_EL1

This function is fairly simple and its purpose is clear: set the value of
__boot_cpu_mode. In the earlier el2_setup function, the boot exception level was left
in w0. Because the system still needs to know the boot-time exception level long
after boot, the global variable __boot_cpu_mode preserves the CPU mode at boot time:

- If the CPU booted in EL1 mode, the first word of __boot_cpu_mode is overwritten
  with BOOT_CPU_MODE_EL1.
- If the CPU booted in EL2 mode, the second word of __boot_cpu_mode is overwritten
  with BOOT_CPU_MODE_EL2.
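
For reference, this is roughly how those two words are consumed later on. The
following is a simplified, compilable C sketch modelled on is_hyp_mode_available()
and is_hyp_mode_mismatched() from arch/arm64/include/asm/virt.h; the constants mirror
the kernel's, but the surrounding scaffolding (the array definition and main) is
illustrative only::

    #include <stdbool.h>
    #include <stdio.h>

    #define BOOT_CPU_MODE_EL1   (0xe11)
    #define BOOT_CPU_MODE_EL2   (0xe12)

    /* mirrors the .long initial values emitted at ENTRY(__boot_cpu_mode) */
    unsigned int __boot_cpu_mode[2] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

    /*
     * EL2 is usable only if every CPU entered at EL2: no CPU overwrote
     * word 0 with EL1, and at least one EL2 boot overwrote word 1.
     */
    static bool is_hyp_mode_available(void)
    {
        return __boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
               __boot_cpu_mode[1] == BOOT_CPU_MODE_EL2;
    }

    /* some CPUs entered at EL1 while others entered at EL2 */
    static bool is_hyp_mode_mismatched(void)
    {
        return __boot_cpu_mode[0] != __boot_cpu_mode[1];
    }

    int main(void)
    {
        printf("hyp available: %d, mismatched: %d\n",
               is_hyp_mode_available(), is_hyp_mode_mismatched());
        return 0;
    }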

__create_page_tables
^^^^^^^^^^^^^^^^^^^^

This is a key piece of code: it builds the initial page tables that are needed before
the MMU can be turned on (the MMU itself is enabled later, in __primary_switch)::

    /*
     * Setup the initial page tables. We only setup the barest amount which is
     * required to get the kernel running. The following sections are required:
     *   - identity mapping to enable the MMU (low address, TTBR0)
     *   - first few MB of the kernel linear mapping to jump to once the MMU has
     *     been enabled
     */
    __create_page_tables:
        mov     x28, lr

        /*
         * Invalidate the init page tables to avoid potential dirty cache lines
         * being evicted. Other page tables are allocated in rodata as part of
         * the kernel image, and thus are clean to the PoC per the boot
         * protocol.
         */
        adrp    x0, init_pg_dir
        adrp    x1, init_pg_end
        sub     x1, x1, x0
        bl      __inval_dcache_area

        /*
         * Clear the init page tables.
         */
        adrp    x0, init_pg_dir
        adrp    x1, init_pg_end
        sub     x1, x1, x0
    1:  stp     xzr, xzr, [x0], #16
        stp     xzr, xzr, [x0], #16
        stp     xzr, xzr, [x0], #16
        stp     xzr, xzr, [x0], #16
        subs    x1, x1, #64
        b.ne    1b

        mov     x7, SWAPPER_MM_MMUFLAGS

        /*
         * Create the identity mapping.
         */
        adrp    x0, idmap_pg_dir
        adrp    x3, __idmap_text_start          // __pa(__idmap_text_start)

    #ifdef CONFIG_ARM64_VA_BITS_52
        mrs_s   x6, SYS_ID_AA64MMFR2_EL1
        and     x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
        mov     x5, #52
        cbnz    x6, 1f
    #endif
        mov     x5, #VA_BITS_MIN
    1:
        adr_l   x6, vabits_actual
        str     x5, [x6]
        dmb     sy
        dc      ivac, x6                        // Invalidate potentially stale cache line

        /*
         * VA_BITS may be too small to allow for an ID mapping to be created
         * that covers system RAM if that is located sufficiently high in the
         * physical address space. So for the ID map, use an extended virtual
         * range in that case, and configure an additional translation level
         * if needed.
         *
         * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
         * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
         * this number conveniently equals the number of leading zeroes in
         * the physical address of __idmap_text_end.
         */
        adrp    x5, __idmap_text_end
        clz     x5, x5
        cmp     x5, TCR_T0SZ(VA_BITS)           // default T0SZ small enough?
        b.ge    1f                              // .. then skip VA range extension

        adr_l   x6, idmap_t0sz
        str     x5, [x6]
        dmb     sy
        dc      ivac, x6                        // Invalidate potentially stale cache line

    #if (VA_BITS < 48)
    #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
    #define EXTRA_PTRS  (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))

        /*
         * If VA_BITS < 48, we have to configure an additional table level.
         * First, we have to verify our assumption that the current value of
         * VA_BITS was chosen such that all translation levels are fully
         * utilised, and that lowering T0SZ will always result in an additional
         * translation level to be configured.
         */
    #if VA_BITS != EXTRA_SHIFT
    #error "Mismatch between VA_BITS and page size/number of translation levels"
    #endif

        mov     x4, EXTRA_PTRS
        create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
    #else
        /*
         * If VA_BITS == 48, we don't have to configure an additional
         * translation level, but the top-level table has more entries.
         */
        mov     x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
        str_l   x4, idmap_ptrs_per_pgd, x5
    #endif
    1:
        ldr_l   x4, idmap_ptrs_per_pgd
        mov     x5, x3                          // __pa(__idmap_text_start)
        adr_l   x6, __idmap_text_end            // __pa(__idmap_text_end)

        map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14

        /*
         * Map the kernel image (starting with PHYS_OFFSET).
         */
        adrp    x0, init_pg_dir
        mov_q   x5, KIMAGE_VADDR + TEXT_OFFSET  // compile time __va(_text)
        add     x5, x5, x23                     // add KASLR displacement
        mov     x4, PTRS_PER_PGD
        adrp    x6, _end                        // runtime __pa(_end)
        adrp    x3, _text                       // runtime __pa(_text)
        sub     x6, x6, x3                      // _end - _text
        add     x6, x6, x5                      // runtime __va(_end)

        map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14

        /*
         * Since the page tables have been populated with non-cacheable
         * accesses (MMU disabled), invalidate the idmap and swapper page
         * tables again to remove any speculatively loaded cache lines.
         */
        adrp    x0, idmap_pg_dir
        adrp    x1, init_pg_end
        sub     x1, x1, x0
        dmb     sy
        bl      __inval_dcache_area

        ret     x28
    ENDPROC(__create_page_tables)

The function consists of two main parts:

1) the identity mapping prepared so that the MMU can be enabled (low addresses,
   TTBR0);
2) the linear mapping (swapper) covering the whole kernel image, built for the
   kernel's own execution.

What is an identity mapping? Before the MMU is enabled, code executes using physical
addresses; after it is enabled, virtual addresses are used and are translated through
the translation tables before physical memory can be accessed. In other words, the
address spaces in use before and after the MMU is switched on are different. The code
that flips the MMU on must therefore be mapped with virtual address == physical
address: at the instant translation starts, the PC still holds a physical address,
and the next instruction fetch would fail if that address were not also a valid
virtual address mapping back to the same location.

......... (this part is still to be completed; I have not yet fully understood the
MMU details)
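
Even so, the core idea of the identity map can be sketched at a high level. Below is
a minimal, hypothetical C illustration; the table layout, the names idmap_table and
create_idmap, and the flag value are all invented for the example and do not
correspond to the kernel's real create_table_entry/map_memory macros::

    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK_SHIFT 21                      /* 2 MB blocks with a 4 KB granule */
    #define BLOCK_SIZE  (1UL << BLOCK_SHIFT)
    #define MMU_FLAGS   0x711UL                 /* made-up attribute bits */

    /* made-up single-level table covering the region around the idmap text */
    static uint64_t idmap_table[512];

    static void create_idmap(uint64_t pa_start, uint64_t pa_end)
    {
        uint64_t pa;

        for (pa = pa_start & ~(BLOCK_SIZE - 1); pa < pa_end; pa += BLOCK_SIZE)
            /*
             * VA == PA: the table index is derived from the very address
             * being mapped, so looking that address up once the MMU is on
             * yields the same physical address back.
             */
            idmap_table[(pa >> BLOCK_SHIFT) % 512] = pa | MMU_FLAGS;
    }

    int main(void)
    {
        /* identity-map a 4 MB window around a made-up physical text address */
        uint64_t pa = 0x40200000UL;

        create_idmap(pa, pa + 2 * BLOCK_SIZE);
        printf("entry for %#llx: %#llx\n",
               (unsigned long long)pa,
               (unsigned long long)idmap_table[(pa >> BLOCK_SHIFT) % 512]);
        return 0;
    }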

__cpu_setup
^^^^^^^^^^^

....... (also skipped for now; to be filled in later)

__primary_switch
^^^^^^^^^^^^^^^^

::

    __primary_switch:
    #ifdef CONFIG_RANDOMIZE_BASE
        mov     x19, x0                         // preserve new SCTLR_EL1 value
        mrs     x20, sctlr_el1                  // preserve old SCTLR_EL1 value
    #endif

        adrp    x1, init_pg_dir
        bl      __enable_mmu
    #ifdef CONFIG_RELOCATABLE
    #ifdef CONFIG_RELR
        mov     x24, #0                         // no RELR displacement yet
    #endif
        bl      __relocate_kernel
    #ifdef CONFIG_RANDOMIZE_BASE
        ldr     x8, =__primary_switched
        adrp    x0, __PHYS_OFFSET
        blr     x8

        /*
         * If we return here, we have a KASLR displacement in x23 which we need
         * to take into account by discarding the current kernel mapping and
         * creating a new one.
         */
        pre_disable_mmu_workaround
        msr     sctlr_el1, x20                  // disable the MMU
        isb
        bl      __create_page_tables            // recreate kernel mapping

        tlbi    vmalle1                         // Remove any stale TLB entries
        dsb     nsh

        msr     sctlr_el1, x19                  // re-enable the MMU
        isb
        ic      iallu                           // flush instructions fetched
        dsb     nsh                             // via old mapping
        isb

        bl      __relocate_kernel
    #endif
    #endif
        ldr     x8, =__primary_switched
        adrp    x0, __PHYS_OFFSET
        br      x8
    ENDPROC(__primary_switch)

C code such as start_kernel cannot run without a stack. When a user-space process
traps into the kernel, the stack switches to the kernel stack, which is in practice
the top of that process's thread_info memory region (THREAD_SIZE bytes).
init_thread_union is that region for process 0, swapper; in __primary_switched (not
quoted above), the instruction ``add sp, x4, #THREAD_SIZE`` points sp at the top of
that stack.
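
To make that stack layout concrete, here is a small, compilable sketch of the
thread_union idea. The THREAD_SIZE value and the thread_info member are simplified
stand-ins rather than the real kernel definitions (on recent arm64 kernels,
thread_info actually lives inside task_struct when CONFIG_THREAD_INFO_IN_TASK is
set)::

    #include <stdio.h>

    #define THREAD_SIZE (16 * 1024)             /* illustrative; config-dependent */

    struct thread_info {
        unsigned long flags;                    /* stand-in member */
    };

    union thread_union {
        struct thread_info thread_info;         /* at the bottom of the region */
        unsigned long stack[THREAD_SIZE / sizeof(unsigned long)];
    };

    union thread_union init_thread_union;       /* process 0's region */

    int main(void)
    {
        /* what "add sp, x4, #THREAD_SIZE" computes: region base + THREAD_SIZE */
        void *initial_sp = (char *)&init_thread_union + THREAD_SIZE;

        printf("initial kernel sp = %p\n", initial_sp);
        return 0;
    }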