7.10.2. linux arm64 head.S in Detail

First, here is the main code segment of head.S:

    __INIT

    /*
     * The following callee saved general purpose registers are used on the
     * primary lowlevel boot path:
     *
     *  Register   Scope                      Purpose
     *  x21        stext() .. start_kernel()  FDT pointer passed at boot in x0
     *  x23        stext() .. start_kernel()  physical misalignment/KASLR offset
     *  x28        __create_page_tables()     callee preserved temp register
     *  x19/x20    __primary_switch()         callee preserved temp registers
     *  x24        __primary_switch() .. relocate_kernel()
     *                                        current RELR displacement
     */
ENTRY(stext)
    bl      preserve_boot_args
    bl      el2_setup                       // Drop to EL1, w0=cpu_boot_mode
    adrp    x23, __PHYS_OFFSET
    and     x23, x23, MIN_KIMG_ALIGN - 1    // KASLR offset, defaults to 0
    bl      set_cpu_boot_mode_flag
    bl      __create_page_tables
    /*
     * The following calls CPU setup code, see arch/arm64/mm/proc.S for
     * details.
     * On return, the CPU will be ready for the MMU to be turned on and
     * the TCR will have been set.
     */
    bl      __cpu_setup                     // initialise processor
    b       __primary_switch
ENDPROC(stext)

These are the main functions in head.S. They are executed in sequence, and finally __primary_switch branches to start_kernel, where execution of C code begins.

7.10.2.1. preserve_boot_args

/*
 * Preserve the arguments passed by the bootloader in x0 .. x3
 */
preserve_boot_args:
    mov     x21, x0                         // x21=FDT

    adr_l   x0, boot_args                   // record the contents of
    stp     x21, x1, [x0]                   // x0 .. x3 at kernel entry
    stp     x2, x3, [x0, #16]

    dmb     sy                              // needed before dc ivac with
                                            // MMU off

    mov     x1, #0x20                       // 4 x 8 bytes
    b       __inval_dcache_area             // tail call
ENDPROC(preserve_boot_args)

The bootloader passes the physical address of the device tree (FDT) in general-purpose register x0, so x21 now holds the FDT's physical address for use later in the boot path; this also frees x0 for other use. The values of x0..x3 are then stored at the address of the boot_args label, with x1 loaded with 0x20 (32 bytes, i.e. 4 × 8-byte registers) as the size argument. dmb sy issues a data memory barrier, which is required before the dc ivac operations performed while the MMU is off, and __inval_dcache_area is tail-called to invalidate the data-cache lines covering that memory.

/*
 * The recorded values of x0 .. x3 upon kernel entry.
 */
u64 __cacheline_aligned boot_args[4];
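
Why are these values recorded at all? Later in boot, setup_arch() (arch/arm64/kernel/setup.c) checks them: the arm64 boot protocol requires x1..x3 to be zero at kernel entry. Roughly (a sketch; the exact message differs across kernel versions):

    if (boot_args[1] || boot_args[2] || boot_args[3])
        pr_err("WARNING: x1-x3 nonzero in violation of boot protocol!\n"
               "This indicates a broken bootloader or old kernel\n");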

A note on the adr_l macro: it loads the runtime address of the boot_args label into x0. adr_l ultimately uses the adrp instruction, which turns a link-time symbol address into a runtime address (the symbol addresses laid out by vmlinux.lds.S are all virtual addresses). Since the MMU is not yet enabled at this point, the runtime address is a physical address.
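
For reference, adr_l is defined in arch/arm64/include/asm/assembler.h essentially as follows: adrp computes the 4 KB page address of the symbol PC-relatively (hence a runtime address), and the add supplies the low 12 bits:

    .macro adr_l, dst, sym
    adrp    \dst, \sym
    add     \dst, \dst, :lo12:\sym
    .endm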

7.10.2.2. el2_setup

ARMv8 introduces the notion of exception levels, EL0..EL3, four in total, replacing the older concept of normal versus privileged processor modes. User-space applications run at the least privileged level, EL0; the OS kernel runs at EL1; EL2 serves virtualization (hypervisors); and the secure monitor, which provides security support, sits at EL3.

The SPSel (Stack Pointer Select) register is officially described as: "Allows the Stack Pointer to be selected between SP_EL0 and SP_ELx."

CurrentEL is the register from which the current exception level can be read.
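
As an illustration, the beginning of el2_setup (elsewhere in head.S, lightly abridged here; details vary by kernel version) selects SP_ELx and reads CurrentEL to decide whether the CPU entered the kernel at EL2 or at EL1:

    ENTRY(el2_setup)
        msr     SPsel, #1                   // We want to use SP_EL{1,2}
        mrs     x0, CurrentEL
        cmp     x0, #CurrentEL_EL2
        b.eq    1f                          // entered at EL2: go do EL2 setup
        ...                                 // (EL1 path: minimal sctlr_el1 setup)
        mov     w0, #BOOT_CPU_MODE_EL1      // This cpu booted in EL1
        isb
        ret
    1:  ...                                 // EL2 setup; returns w0=BOOT_CPU_MODE_EL2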

Back in stext, the two instructions that follow the el2_setup call are:

adrp    x23, __PHYS_OFFSET
and     x23, x23, MIN_KIMG_ALIGN - 1    // KASLR offset, defaults to 0

#define __PHYS_OFFSET   (KERNEL_START - TEXT_OFFSET)

#define KERNEL_START    _text
#define KERNEL_END              _end

/*
 * arm64 requires the kernel image to be placed
 * TEXT_OFFSET bytes beyond a 2 MB aligned base
 */
#define MIN_KIMG_ALIGN          SZ_2M

These two lines compute the physical misalignment of the kernel image within a 2 MB boundary (the KASLR offset) and leave it in x23. adrp yields the runtime, i.e. physical, page address of __PHYS_OFFSET (note that __PHYS_OFFSET itself is defined from link-time virtual symbols), and the and keeps only the bits below MIN_KIMG_ALIGN (2 MB). Without KASLR the image is loaded 2 MB aligned and x23 is 0; with KASLR the randomized placement can leave a non-zero remainder, which is added back in later when the kernel mapping is created (the "add x5, x5, x23" in __create_page_tables).

7.10.2.3. set_cpu_boot_mode_flag

/*
 * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
 * in w0. See arch/arm64/include/asm/virt.h for more info.
 */
set_cpu_boot_mode_flag:
        adr_l   x1, __boot_cpu_mode
        cmp     w0, #BOOT_CPU_MODE_EL2
        b.ne    1f
        add     x1, x1, #4
1:      str     w0, [x1]                        // This CPU has booted in EL1
        dmb     sy
        dc      ivac, x1                        // Invalidate potentially stale cache line
        ret
ENDPROC(set_cpu_boot_mode_flag)

/*
 * We need to find out the CPU boot mode long after boot, so we need to
 * store it in a writable variable.
 *
 * This is not in .bss, because we set it sufficiently early that the boot-time
 * zeroing of .bss would clobber it.
 */
ENTRY(__boot_cpu_mode)
        .long   BOOT_CPU_MODE_EL2
        .long   BOOT_CPU_MODE_EL1

This function is fairly simple and its purpose is clear: set the value of __boot_cpu_mode. In el2_setup, described above, the boot exception level was left in w0. Because the kernel still needs to know the boot-time exception level long after booting, the global variable __boot_cpu_mode keeps a record of the CPU mode at boot.

If the CPU booted in EL1 mode, word 0 (the first .long) of __boot_cpu_mode is overwritten with BOOT_CPU_MODE_EL1.

If the CPU booted in EL2 mode, word 1 (the second .long) of __boot_cpu_mode is overwritten with BOOT_CPU_MODE_EL2.
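
Together, the two words let later code determine not only which mode the boot CPU used but also whether some secondary CPU booted in a different mode. The helpers in arch/arm64/include/asm/virt.h read them roughly like this (simplified sketch of the actual header):

    /* word 0 still reads BOOT_CPU_MODE_EL2 only if no CPU booted in EL1 */
    static inline bool is_hyp_mode_available(void)
    {
        return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
                __boot_cpu_mode[1] == BOOT_CPU_MODE_EL2);
    }

    /* true if some CPUs booted in EL2 and others in EL1 */
    static inline bool is_hyp_mode_mismatched(void)
    {
        return __boot_cpu_mode[0] != __boot_cpu_mode[1];
    }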

7.10.2.4. __create_page_tables

This is a key piece of code: it builds the initial page tables in preparation for turning on the MMU (the MMU itself is enabled later, by __enable_mmu on the __primary_switch path).

/*
 * Setup the initial page tables. We only setup the barest amount which is
 * required to get the kernel running. The following sections are required:
 *   - identity mapping to enable the MMU (low address, TTBR0)
 *   - first few MB of the kernel linear mapping to jump to once the MMU has
 *     been enabled
 */
__create_page_tables:
        mov     x28, lr

        /*
         * Invalidate the init page tables to avoid potential dirty cache lines
         * being evicted. Other page tables are allocated in rodata as part of
         * the kernel image, and thus are clean to the PoC per the boot
         * protocol.
         */
        adrp    x0, init_pg_dir
        adrp    x1, init_pg_end
        sub     x1, x1, x0
        bl      __inval_dcache_area

        /*
         * Clear the init page tables.
         */
        adrp    x0, init_pg_dir
        adrp    x1, init_pg_end
        sub     x1, x1, x0
1:      stp     xzr, xzr, [x0], #16
        stp     xzr, xzr, [x0], #16
        stp     xzr, xzr, [x0], #16
        stp     xzr, xzr, [x0], #16
        subs    x1, x1, #64
        b.ne    1b

        mov     x7, SWAPPER_MM_MMUFLAGS

        /*
         * Create the identity mapping.
         */
        adrp    x0, idmap_pg_dir
        adrp    x3, __idmap_text_start          // __pa(__idmap_text_start)

#ifdef CONFIG_ARM64_VA_BITS_52
        mrs_s   x6, SYS_ID_AA64MMFR2_EL1
        and     x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
        mov     x5, #52
        cbnz    x6, 1f
#endif
        mov     x5, #VA_BITS_MIN
1:
        adr_l   x6, vabits_actual
        str     x5, [x6]
        dmb     sy
        dc      ivac, x6                // Invalidate potentially stale cache line

        /*
         * VA_BITS may be too small to allow for an ID mapping to be created
         * that covers system RAM if that is located sufficiently high in the
         * physical address space. So for the ID map, use an extended virtual
         * range in that case, and configure an additional translation level
         * if needed.
         *
         * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
         * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
         * this number conveniently equals the number of leading zeroes in
         * the physical address of __idmap_text_end.
         */
        adrp    x5, __idmap_text_end
        clz     x5, x5
        cmp     x5, TCR_T0SZ(VA_BITS)   // default T0SZ small enough?
        b.ge    1f                      // .. then skip VA range extension

        adr_l   x6, idmap_t0sz
        str     x5, [x6]
        dmb     sy
        dc      ivac, x6                // Invalidate potentially stale cache line

#if (VA_BITS < 48)
#define EXTRA_SHIFT     (PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS      (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))

        /*
         * If VA_BITS < 48, we have to configure an additional table level.
         * First, we have to verify our assumption that the current value of
         * VA_BITS was chosen such that all translation levels are fully
         * utilised, and that lowering T0SZ will always result in an additional
         * translation level to be configured.
         */
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif

        mov     x4, EXTRA_PTRS
        create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
#else
        /*
         * If VA_BITS == 48, we don't have to configure an additional
         * translation level, but the top-level table has more entries.
         */
        mov     x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
        str_l   x4, idmap_ptrs_per_pgd, x5
#endif
1:
        ldr_l   x4, idmap_ptrs_per_pgd
        mov     x5, x3                          // __pa(__idmap_text_start)
        adr_l   x6, __idmap_text_end            // __pa(__idmap_text_end)

        map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14

        /*
         * Map the kernel image (starting with PHYS_OFFSET).
         */
        adrp    x0, init_pg_dir
        mov_q   x5, KIMAGE_VADDR + TEXT_OFFSET  // compile time __va(_text)
        add     x5, x5, x23                     // add KASLR displacement
        mov     x4, PTRS_PER_PGD
        adrp    x6, _end                        // runtime __pa(_end)
        adrp    x3, _text                       // runtime __pa(_text)
        sub     x6, x6, x3                      // _end - _text
        add     x6, x6, x5                      // runtime __va(_end)

        map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14

        /*
         * Since the page tables have been populated with non-cacheable
         * accesses (MMU disabled), invalidate the idmap and swapper page
         * tables again to remove any speculatively loaded cache lines.
         */
        adrp    x0, idmap_pg_dir
        adrp    x1, init_pg_end
        sub     x1, x1, x0
        dmb     sy
        bl      __inval_dcache_area

        ret     x28
ENDPROC(__create_page_tables)

It consists of two main parts:

  1. an identity mapping, prepared so that the MMU can be turned on

  2. a mapping covering the entire kernel image (the swapper mapping, here rooted at init_pg_dir), built for the kernel's subsequent execution

What is an identity mapping? Before the MMU is turned on, code executes using physical addresses; once the MMU is on, every access goes through a virtual address that the translation tables convert before physical memory is reached. In other words, the address space in use changes across the instant the MMU is enabled. The identity mapping deals with this: the code that performs the switch (the .idmap.text region) is mapped so that virtual address equals physical address, so the instructions being executed remain valid both before and after the MMU comes on.
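
A concrete way to see how the two mappings are used: idmap_pg_dir holds the identity (VA == PA) mapping and is installed in TTBR0_EL1, while init_pg_dir holds the kernel-image mapping and is installed in TTBR1_EL1. Below is a simplified sketch of the relevant steps in __enable_mmu (the real code additionally converts the addresses with phys_to_ttbr/offset_ttbr1); x0 carries the SCTLR_EL1 value prepared by __cpu_setup and x1 carries init_pg_dir, as set up in __primary_switch:

    adrp    x2, idmap_pg_dir
    msr     ttbr0_el1, x2                   // identity map keeps the PC valid
    msr     ttbr1_el1, x1                   // kernel mapping (init_pg_dir)
    isb
    msr     sctlr_el1, x0                   // M bit set: the MMU is now on
    isb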

……… (This part is still to be completed; the MMU details need further study.)

7.10.2.5. __cpu_setup

……. This part is skipped as well, to be filled in later.

7.10.2.6. __primary_switch

__primary_switch:
#ifdef CONFIG_RANDOMIZE_BASE
        mov     x19, x0                         // preserve new SCTLR_EL1 value
        mrs     x20, sctlr_el1                  // preserve old SCTLR_EL1 value
#endif

        adrp    x1, init_pg_dir
        bl      __enable_mmu
#ifdef CONFIG_RELOCATABLE
#ifdef CONFIG_RELR
        mov     x24, #0                         // no RELR displacement yet
#endif
        bl      __relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE
        ldr     x8, =__primary_switched
        adrp    x0, __PHYS_OFFSET
        blr     x8

        /*
         * If we return here, we have a KASLR displacement in x23 which we need
         * to take into account by discarding the current kernel mapping and
         * creating a new one.
         */
        pre_disable_mmu_workaround
        msr     sctlr_el1, x20                  // disable the MMU
        isb
        bl      __create_page_tables            // recreate kernel mapping

        tlbi    vmalle1                         // Remove any stale TLB entries
        dsb     nsh

        msr     sctlr_el1, x19                  // re-enable the MMU
        isb
        ic      iallu                           // flush instructions fetched
        dsb     nsh                             // via old mapping
        isb

        bl      __relocate_kernel
#endif
#endif
        ldr     x8, =__primary_switched
        adrp    x0, __PHYS_OFFSET
        br      x8
ENDPROC(__primary_switch)

C code such as start_kernel cannot run without a stack. When a user-space process traps into the kernel, the stack is switched to the kernel stack, which is in effect the top of that process's thread_info/stack area (THREAD_SIZE bytes; 16 KB by default on arm64). init_thread_union is that area for process 0, the swapper process, and add sp, x4, #THREAD_SIZE points sp at the top of its stack.
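
For reference, this is how the top of __primary_switched reads (lightly abridged; x21 still holds the FDT pointer saved by preserve_boot_args):

    __primary_switched:
        adrp    x4, init_thread_union
        add     sp, x4, #THREAD_SIZE            // sp = top of init_task's kernel stack
        adr_l   x5, init_task
        msr     sp_el0, x5                      // Save thread_info
        ...
        str_l   x21, __fdt_pointer, x5          // Save FDT pointer
        ...
        b       start_kernel
    ENDPROC(__primary_switched)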