Sholck

不积跬步,无以至千里.不积小流,无以成江海

0%

linux模块初始化分析

模块接口分析

模块编译

在代码编译前宏展开时,需要进行条件编译,这需要gcc指定参数,而指定什么参数由Makefile来控制.
trace.c在编译时gcc中的参数中会带有 -D__KERNEL__, -DMODULE, -D__KBUILD_MODNAME=kmod_trace, 可以参考linux-likely学习,这用在之后的宏展开。

模块入口

不同的模块根据要求和实现有着不同的初始化顺序,内核模块的入口在include/linux/init.h中定义

内核模块入口
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#define pure_initcall(fn)               __define_initcall(fn, 0)    

#define core_initcall(fn) __define_initcall(fn, 1)
#define core_initcall_sync(fn) __define_initcall(fn, 1s)
#define postcore_initcall(fn) __define_initcall(fn, 2)
#define postcore_initcall_sync(fn) __define_initcall(fn, 2s)
#define arch_initcall(fn) __define_initcall(fn, 3)
#define arch_initcall_sync(fn) __define_initcall(fn, 3s)
#define subsys_initcall(fn) __define_initcall(fn, 4)
#define subsys_initcall_sync(fn) __define_initcall(fn, 4s)
#define fs_initcall(fn) __define_initcall(fn, 5)
#define fs_initcall_sync(fn) __define_initcall(fn, 5s)
#define rootfs_initcall(fn) __define_initcall(fn, rootfs)
#define device_initcall(fn) __define_initcall(fn, 6)
#define device_initcall_sync(fn) __define_initcall(fn, 6s)
#define late_initcall(fn) __define_initcall(fn, 7)
#define late_initcall_sync(fn) __define_initcall(fn, 7s)

动态模块的入口为include/linux/module.h中定义,因为在编译模块时会使用gcc指令,其中会包含-DMODULE,因此相当于走#ifdef MODULE

动态模块入口
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#define early_initcall(fn)              module_init(fn)
#define core_initcall(fn) module_init(fn)
#define core_initcall_sync(fn) module_init(fn)
#define postcore_initcall(fn) module_init(fn)
#define postcore_initcall_sync(fn) module_init(fn)
#define arch_initcall(fn) module_init(fn)
#define subsys_initcall(fn) module_init(fn)
#define subsys_initcall_sync(fn) module_init(fn)
#define fs_initcall(fn) module_init(fn)
#define fs_initcall_sync(fn) module_init(fn)
#define rootfs_initcall(fn) module_init(fn)
#define device_initcall(fn) module_init(fn)
#define device_initcall_sync(fn) module_init(fn)
#define late_initcall(fn) module_init(fn)
#define late_initcall_sync(fn) module_init(fn)

#define console_initcall(fn) module_init(fn)

/* Each module must use one module_init(). */
#define module_init(initfn) \
static inline initcall_t __maybe_unused __inittest(void) \
{ return initfn; } \
int init_module(void) __copy(initfn) \
__attribute__((alias(#initfn))); \
__CFI_ADDRESSABLE(init_module, __initdata);

/* This is only required if you want to be unloadable. */
#define module_exit(exitfn) \
static inline exitcall_t __maybe_unused __exittest(void) \
{ return exitfn; } \
void cleanup_module(void) __copy(exitfn) \
__attribute__((alias(#exitfn))); \
__CFI_ADDRESSABLE(cleanup_module, __exitdata);

#endif

入口扩展

内核模块入口扩展开是什么样的,比如fs_initcall(fn)

1
#define fs_initcall(fn)                 __define_initcall(fn, 5)

fs_initcall(tracer_init_tracefs)最后通过宏扩展为

1
static initcall_t   __initcall__kmod_trace__397_9768_tracer_init_tracefs5  __attribute__((__used__))  __attribute__((__section__(".initcall5.init"))) = tracer_init_tracefs; 

扩展过程如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
fs_initcall(tracer_init_tracefs);
__define_initcall(tracer_init_tracefs, 5)
___define_initcall(tracer_init_tracefs, 5, .initcall5)
__unique_initcall(tracer_init_tracefs, 5, .initcall5, __initcall_id(fn))
//-D__KBUILD_MODNAME=kmod_trace在编译时指定
//__COUNTER__ 为预定义宏 调用计数器,每次调用都会+1
// __LINE__ 为预定义宏,表示当前代码在文件中的行号

因为__unique_initcall的定义还有宏,那么此时的__unique_initcall 参数进行一次扩展再替换
__unique_initcall(tracer_init_tracefs, 5, .initcall5, __kmod_trace__397_9768_tracer_init_tracefs)

继续扩展
____define_initcall(tracer_init_tracefs, \
__initcall_stub(tracer_init_tracefs, __kmod_trace__397_9768_tracer_init_tracefs, 5), \
__initcall_name(initcall, __kmod_trace__397_9768_tracer_init_tracefs, 5), \
__initcall_section(.initcall5.init, __kmod_trace__397_9768_tracer_init_tracefs))

因为____define_initcall的定义还有宏,那么此时的____define_initcall 参数进行一次扩展再替换

____define_initcall(tracer_init_tracefs, \
tracer_init_tracefs, \
__initcall__kmod_trace__397_9768_tracer_init_tracefs5, \
.initcall5.init)

//因为init.h包含#include <linux/compiler.h>, 而compiler.h包含 #include <linux/compiler_types.h>,gcc编译中包含了-D__KERNEL__,因此包含#include <linux/compiler_attributes.h>,故__used被定义为如下

#define __used __attribute__((__used__))

则最后扩展为
static initcall_t __initcall__kmod_trace__397_9768_tracer_init_tracefs5 __attribute__((__used__)) __attribute__((__section__(".initcall5.init"))) = tracer_init_tracefs;

宏扩展规则:

  1. 如果宏定义中还有宏(且参数不是#或##的操作数),则参数会先进行一次完全展开,之后再进行替换。比如 #define Tag test 和 #define EXP(a, b) _EXP(a,b),则EXP(Tag, First)首先扩展为_EXP(test, First)
  2. 如果宏定义中没有宏,或者参数是#或##的操作数,则参数不展开,直接替换。比如 #define _EXP(a,b) a##b,则_EXP(test, First) 扩展为testFirst

注意如下, 扩展出来类似为.initcall5.init, 不是.initcall5 .init

1
2
#define __initcall_section(__sec, __iid)                        \    
#__sec ".init"

关于__COUNTER__ ,见Common Predefined Macros,相当于计数器
关于__attribute__((used)),见 gcc 对used变量的描述和gcc 对used函数的描述

1
2
3
4
5
6
7
unused >>可能不使用,不用警告
This attribute, attached to a variable or structure field, means that the variable or field is meant to be possibly unused. GCC does not produce a warning for this variable or field.

used >>作用于静态变量
This attribute, attached to a variable with static storage, means that the variable must be emitted even if it appears that the variable is not referenced.

When applied to a static data member of a C++ class template, the attribute also means that the member is instantiated if the class itself is instantiated.

unused: 实践发现,静态变量如果没有被使用,编译时会有未使用的警告提示,而普通的全局变量则不会;可以通过__attribute__((unused))来消除警告,__attribute__((used))也可以消除
used: 静态变量即使没有被使用,目标文件中依旧会保留,可以通过section查看

unused测试
1
2
3
4
5
6
7
8
9
static int  test_var = 1;

make编译提示:
/github/linux-driver/2-likely/likely.c:12:15: 警告:‘test_var’ defined but not used [-Wunused-variable]
static int test_var = 1;

Makefile中增加EXTRA_CFLAGS = -Wall -g -W,开启警告提示更多

static int __attribute__((__unused__)) test_var;则无任何test_var 的提示
used测试
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
static int  test_var = 1; >>正常为data section下的,但是因为没有被使用过,所以目标文件中不存在

objdump -t likely.o | grep -n "test_var"

static int __attribute__((__used__)) test_var = 1; >>目标文件保留

2-likely git:(master) ✗ objdump -t likely.o | grep -n "test_var"
32:0000000000000000 l O .data 0000000000000004 test_var >>目标文件保留test_var

指定所在section
static int test_var __used __section(".section_test") = 1;或者
static int test_var __attribute__((__used__)) __attribute__((__section__(".section_test"))) = 1;

➜ 2-likely git:(master) ✗ objdump -t likely.o | grep -n "test_var"
33:0000000000000000 l O .section_test 0000000000000004 test_var >>可以看到test_var保留在.section_test
__COUNTER__测试
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#define test_count(fn) \
int fn = __COUNTER__

test_count(a);
test_count(b);
test_count(c);

int main()
{
printf("test is %d\n", a);
printf("test is %d\n", b);
printf("test is %d\n", c);
return 0;
}

执行结果为:

1
2
3
test is 0
test is 1
test is 2

扩展分析

扩展结果为

1
static initcall_t   __initcall__kmod_trace__397_9768_tracer_init_tracefs5  __attribute__((__used__))  __attribute__((__section__(".initcall5.init"))) = tracer_init_tracefs; 

initcall_t 定义为一个普通的函数指针类型,这里声明了
__initcall__kmod_trace__397_9768_tracer_init_tracefs5这个函数指针变量,指向函数tracer_init_tracefs,section为.initcall5.init, 即使没有被引用过,也在目标文件做保留

1
2
3
4
➜  linux git:(master) ✗ objdump -t kernel/trace/trace.o | grep -n "tracer_init_tracefs"
120:0000000000000000 l .initcall5.init 0000000000000000 __initcall__kmod_trace__397_9768_tracer_init_tracefs5
121:0000000000000581 l F .init.text 00000000000002b4 tracer_init_tracefs >>__init导致 #define __init __section(".init.text") __cold __latent_entropy __noinitretpoline __nocfi
426:0000000000000010 l O .discard.addressable 0000000000000008 __UNIQUE_ID___addressable_tracer_init_tracefs398

section可以理解为内存中一块连续的区域,因此section 为.initcall5.init中的函数在内核中地址是连续的,可以查看symbol,比如System.map

1
2
3
4
5
6
7
8
9
10
ffffffff82ebb71c T __initcall5_start         >>section  .initcall5 内存开始
ffffffff82ebb71c t __initcall__kmod_nmi__312_102_nmi_warning_debugfs5
ffffffff82ebb720 t __initcall__kmod_microcode__275_891_save_microcode_in_initrd5
ffffffff82ebb724 t __initcall__kmod_hpet__211_1165_hpet_late_init5
ffffffff82ebb728 t __initcall__kmod_amd_nb__276_507_init_amd_nbs5
ffffffff82ebb72c t __initcall__kmod_resource__294_1914_iomem_init_inode5
ffffffff82ebb730 t __initcall__kmod_clocksource__211_1038_clocksource_done_booting5
ffffffff82ebb734 t __initcall__kmod_trace__397_9768_tracer_init_tracefs5 >>__initcall__kmod_trace__397_9768_tracer_init_tracefs5 内存地址

ffffffff82d5e986 t tracer_init_tracefs

也可以通过gdb 导入vmlinux查看符号表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
(gdb) info address __initcall__kmod_trace__397_9768_tracer_init_tracefs5
Symbol "__initcall__kmod_trace__397_9768_tracer_init_tracefs5" is at 0xffffffff82ebb734 in a file compiled without debugging.
(gdb) info symbol 0xffffffff82ebb734
__initcall__kmod_trace__397_9768_tracer_init_tracefs5 in section .init.data >>奇怪?为什么在vmlinux中是.init.data section,而不是本身的.initcall5.init

(gdb) info address tracer_init_tracefs
Symbol "tracer_init_tracefs" is a function at address 0xffffffff82d5e986.
(gdb) info symbol tracer_init_tracefs
tracer_init_tracefs in section .init.text
(gdb) info symbol 0xffffffff82d5e986
tracer_init_tracefs in section .init.text

➜ linux git:(master) ✗ objdump -t vmlinux | grep -n "tracer_init_tracefs"
18786:ffffffff82ebb734 l .init.data 0000000000000000 __initcall__kmod_trace__397_9768_tracer_init_tracefs5 >>为什么变成.init.data呢?
18787:ffffffff82d5e986 l F .init.text 00000000000002b4 tracer_init_tracefs

>>但是vmlinux.o还是正常的
➜ linux git:(master) ✗ objdump -t vmlinux.o | grep -n "tracer_init_tracefs"
32887:0000000000000018 l .initcall5.init 0000000000000000 __initcall__kmod_trace__397_9768_tracer_init_tracefs5
32888:000000000002e986 l F .init.text 00000000000002b4 tracer_init_tracefs
33178:0000000000004c28 l O .discard.addressable 0000000000000008 __UNIQUE_ID___addressable_tracer_init_tracefs398

发现一个问题,即vmlinux中的__initcall__kmod_trace__397_9768_tracer_init_tracefs5 section 变为.init.data

检查vmlinux,既然作为一个not stripped 的ELF文件,肯定是带有section的,通过objdump解析

1
2
3
4
5
6
7
8
9
10
11
12
➜  linux git:(master) ✗ objdump -h vmlinux
vmlinux: 文件格式 elf64-x86-64

节:
Idx Name Size VMA LMA File off Algn
0 .text 00e020e0 ffffffff81000000 0000000001000000 00200000 2**12
CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
19 .init.text 0006c870 ffffffff82d30000 0000000002d30000 02130000 2**4
CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
21 .init.data 0011bfa0 ffffffff82da0000 0000000002da0000 021a0000 2**13
CONTENTS, ALLOC, LOAD, RELOC, DATA
...

发现并没有找到.initcall5.init section,思考:肯定是先编译出来的kernel/trace/trace.o,再整合出来的vmlinux,那么从单个.o到vmlinux,
函数的symbol肯定进行一个合并,而且section也会发生变化,这个整合section的规则是什么?因此开始研究第二部分

section整合规则

在之前Kbuild-Makefile的学习中了解到

  1. ld有自己的默认链接方式,也可以指定链接脚本进行链接
  2. 在链接vmlinux时会先链接.tmp_vmlinux.kallsyms1,
    链接参数为--script=./arch/x86/kernel/vmlinux.lds
  3. vmlinux.lds是vmlinux.lds.S通过gcc预处理而来,-E选项代表只进行预处理,不进行编译
    1
    "gcc -E -Wp,-MMD,arch/x86/kernel/.vmlinux.lds.d  -nostdinc -I./arch/x86/include -I./arch/x86/include/generated  -I./include -I./arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/compiler-version.h -include ./include/linux/kconfig.h -D__KERNEL__     -Ux86_64 -P -Ux86 -D__ASSEMBLY__ -DLINKER_SCRIPT -o arch/x86/kernel/vmlinux.lds arch/x86/kernel/vmlinux.lds.S"

因此重点关注

  1. vmlinux.lds.S如何进行预处理的? 预处理时针对内核进行了哪些动态的设置?

    预处理应该是进行一些注释的删除,文件包含,宏定义,条件编译
    动态设置包括:输出格式,函数入口,section的内存存放地址和整合规则
    下面是.init.data section的封装设置

    .init.data section的封装
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    //vmlinux.lds.S
    INIT_DATA_SECTION(16)

    //include/asm-generic/vmlinux.lds.h
    #define INIT_DATA_SECTION(initsetup_align) \
    .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { \
    INIT_DATA \
    INIT_SETUP(initsetup_align) \
    INIT_CALLS \
    CON_INITCALL \
    INIT_RAM_FS \
    }

    #define INIT_CALLS_LEVEL(level) \
    __initcall##level##_start = .; \
    KEEP(*(.initcall##level##.init)) \
    KEEP(*(.initcall##level##s.init)) \

    #define INIT_CALLS \
    __initcall_start = .; \
    KEEP(*(.initcallearly.init)) \
    INIT_CALLS_LEVEL(0) \
    INIT_CALLS_LEVEL(1) \
    INIT_CALLS_LEVEL(2) \
    INIT_CALLS_LEVEL(3) \
    INIT_CALLS_LEVEL(4) \
    INIT_CALLS_LEVEL(5) \
    INIT_CALLS_LEVEL(rootfs) \
    INIT_CALLS_LEVEL(6) \
    INIT_CALLS_LEVEL(7) \
    __initcall_end = .;

  2. section的调整规则?

    在vmlinux.lds的SECTIONS部分,针对.init.data section,通过以上宏扩展,
    部分扩展如下__initcall5_start = .; KEEP(*(.initcall5.init))
    强制将.initcall5.init section保留并合并到.init.data section分区

启动调用分析

我们将各级的.initcall都整合到.init.data section之后链接到vmlinux。此时各对象(比如trace.o)下.initcall5.init的symbol都链接到.init.data的同一块内存区域,
组成了一块从__initcall5_start地址开始的、类型为initcall_t的数组, 可以理解为 initcall_t __initcall5_start[]; 在init/main.c中就可以通过__initcall5_start这个数组去调用每一个
通过fs_initcall注册的模块接口

下面是具体的调用实现

__initcall5_start调用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
//init/main.c
extern initcall_entry_t __initcall5_start[];
...
extern initcall_entry_t __initcall_end[];

static initcall_entry_t *initcall_levels[] __initdata = {
__initcall0_start,
__initcall1_start,
__initcall2_start,
__initcall3_start,
__initcall4_start,
__initcall5_start,
__initcall6_start,
__initcall7_start,
__initcall_end,
};

//从名字可以看出每个阶段应该做的事情
static const char *initcall_level_names[] __initdata = {
"pure",
"core",
"postcore",
"arch",
"subsys",
"fs",
"device",
"late",
};


static void __init do_initcall_level(int level, char *command_line)
{
initcall_entry_t *fn;

parse_args(initcall_level_names[level],
command_line, __start___param,
__stop___param - __start___param,
level, level,
NULL, ignore_unknown_bootoption);

trace_initcall_level(initcall_level_names[level]);
for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) >>获取连续的每一个5级别symbol内存存放的地址
do_one_initcall(initcall_from_entry(fn));>> *fn获取每一个5级别函数的symbol
}

int __init_or_module do_one_initcall(initcall_t fn)
{
...
do_trace_initcall_start(fn);
ret = fn(); >>开始真正的调用
do_trace_initcall_finish(fn, ret);
...
}


启动调用gdb分析

大坑疑问

在设置gdb断点时碰到了大坑,代码部分如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
1359 static void __init do_initcall_level(int level, char *command_line)
1360 {
1361 initcall_entry_t *fn;
1362
1363 parse_args(initcall_level_names[level],
1364 command_line, __start___param,
1365 __stop___param - __start___param,
1366 level, level,
1367 NULL, ignore_unknown_bootoption);
1368
1369 trace_initcall_level(initcall_level_names[level]);
1370 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
1371 do_one_initcall(initcall_from_entry(fn)); >>offset_to_ptr(entry);//CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y
1372 }

下面是疑问?

  1. b do_initcall_level可以设置成功,但是symbol里应该没有这个函数,但是为什么可以设置成功

    1
    2
    (gdb) b do_initcall_level
    Breakpoint 3 at 0xffffffff82d3197d: file init/main.c, line 1365.
  2. b do_initcall_level既然可以设置成功,但是为什么断点只触发一次?

  3. 为什么设置的断点在line 1365.

    1
    2
    3
    (gdb) info symbol 0xffffffff82d3197d 
    kernel_init_freeable + 367 in section .init.text

  4. 设置b init/main.c:1359为什么也只触发一次?

    1
    2
    (gdb) b init/main.c:1359
    Breakpoint 1 at 0xffffffff82d31984: file init/main.c, line 1363.
  5. 设置b __initcall__kmod_nmi__312_102_nmi_warning_debugfs5无效?但是symbol对应的地址都是正确的。 –>因为symbol的类型不是function

    1
    2
    3
    4
    (gdb) info address __initcall__kmod_nmi__312_102_nmi_warning_debugfs5
    Symbol "__initcall__kmod_nmi__312_102_nmi_warning_debugfs5" is at 0xffffffff82ebb71c in a file compiled without debugging.
    (gdb) b __initcall__kmod_nmi__312_102_nmi_warning_debugfs5
    Function "__initcall__kmod_nmi__312_102_nmi_warning_debugfs5" not defined.

分析CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y情况

为了分析.init.data section中的.initcall5.init内存地址,设置如下断点

  1. init/main.c:1369或者trace_initcall_level(更好)

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
     //设置断点init/main.c:1369时的触发打印
    (gdb) info b
    Num Type Disp Enb Address What
    breakpoint keep y 0xffffffff82d319f1 in do_initcall_level at init/main.c:1369

    Continuing.

    Thread 1 hit Breakpoint 1, do_initcall_level (command_line=0xffff888003783100 "root",
    level=<optimized out>) at init/main.c:1369
    1369 trace_initcall_level(initcall_level_names[level]);

    (gdb) p level
    $1 = <optimized out> >>编译器优化无法查看level,无法判定走到哪个级别的内存处理


    //设置断点trace_initcall_level时的触发打印
    (gdb) b trace_initcall_level
    Breakpoint 1 at 0xffffffff81b07600: trace_initcall_level. (2 locations)
    (gdb) info b
    Num Type Disp Enb Address What
    1 breakpoint keep y <MULTIPLE>
    1.1 y 0xffffffff81b07600 in trace_initcall_level
    at ./arch/x86/include/asm/jump_label.h:27
    1.2 y 0xffffffff82d591ba in trace_initcall_level
    at ./arch/x86/include/asm/jump_label.h:27

    (gdb) c
    Continuing.
    [Switching to Thread 2]

    Thread 2 hit Breakpoint 1, trace_initcall_level (level=0xffffffff8237c4a7 "fs") >>此时可以看到initcall_level_names
    at ./include/trace/events/initcall.h:10
    10 TRACE_EVENT(initcall_level,

  2. init/main.c:1370 和 do_one_initcall:等流程跑到级别5的时候再去设置

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    trace_initcall_level (level=0xffffffff8237c4a7 "fs") >>级别5

    (gdb) b init/main.c:1370
    Breakpoint 2 at 0xffffffff82d319fe: file init/main.c, line 1370.
    (gdb) b do_one_initcall
    Breakpoint 3 at 0xffffffff81000f90: file init/main.c, line 1289.

    //do_one_initcall是有symbol的
    ffffffff81000f90 g F .text 00000000000001d6 do_one_initcall

    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 2, do_initcall_level (command_line=0xffff888003783100 "root",
    level=<optimized out>) at init/main.c:1370
    1370 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
    (gdb) p fn
    $1 = <optimized out> >>fn也被编译器优化了
    (gdb) p level
    $2 = <optimized out>

此时开始检查内存中存放.initcall5.init section的内存存储的symbol

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
(gdb) c
Continuing.

Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d3dd47 <nmi_warning_debugfs>)
at init/main.c:1289
1289 {


(gdb) p initcall_levels
$3 = {0xffffffff82ebb2b4, 0xffffffff82ebb2c8, 0xffffffff82ebb3c0, 0xffffffff82ebb444, 0xffffffff82ebb4a8, 0xffffffff82ebb71c, 0xffffffff82ebb848, 0xffffffff82ebbc28, 0xffffffff82ebbd84}
(gdb) p initcall_levels[5]
$6 = (initcall_entry_t *) 0xffffffff82ebb71c

//此时发现0xffffffff82ebb71c为 .initcall5.init section的首地址
ffffffff82ebb71c g .init.data 0000000000000000 __initcall5_start
//也是第一个symbol的地址值,此地址应该存放的是nmi_warning_debugfs的symbol地址
ffffffff82ebb71c l .init.data 0000000000000000 __initcall__kmod_nmi__312_102_nmi_warning_debugfs5

//ffffffff82d3dd47 l F .init.text 0000000000000027 nmi_warning_debugfs
(gdb) x/x 0xffffffff82ebb71c
0xffffffff82ebb71c: 0xffe8ae5dffe8262b >>为什么不是nmi_warning_debugfs的symbol 0xffffffff82d3dd47呢?这儿是偏移

(gdb) c
Continuing.

Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d4657d <save_microcode_in_initrd>) at init/main.c:1289
1289 {
(gdb) p fn
$10 = (initcall_t) 0xffffffff82d4657d <save_microcode_in_initrd>
(gdb) c
Continuing.

Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d4e996 <hpet_late_init>) at init/main.c:1289
1289 {
(gdb) c
Continuing.

Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d4ecfa <init_amd_nbs>) at init/main.c:1289
1289 {
(gdb) c
Continuing.

Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d5608d <iomem_init_inode>) at init/main.c:1289
1289 {
(gdb) c
Continuing.

Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d5aded <clocksource_done_booting>) at init/main.c:1289
1289 {
(gdb) c
Continuing.

Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d5e986 <tracer_init_tracefs>) at init/main.c:1289
1289 {
(gdb) p fn
$11 = (initcall_t) 0xffffffff82d5e986 <tracer_init_tracefs>

能发现.initcall5.init section的内存地址的存储的symbol地址如下

1
2
3
4
5
6
7
8
9
10
11
ffffffff82d3dd47 l     F .init.text     0000000000000027 nmi_warning_debugfs <--

ffffffff82d4657d l F .init.text 00000000000000a3 save_microcode_in_initrd <--

ffffffff82d4e996 l F .init.text 0000000000000364 hpet_late_init <--

...

ffffffff82d5aded l F .init.text 0000000000000042 clocksource_done_booting <--

ffffffff82d5e986 l F .init.text 00000000000002b4 tracer_init_tracefs <--

fn是initcall_entry_t *类型,fn++会使地址偏移其所指向类型的大小;从symbol来看,.initcall5.init section下的symbol地址都是依次偏移4个字节,此时发现.config配置为CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y

1
2
3
4
5
6
ffffffff82ebb71c l       .init.data     0000000000000000 __initcall__kmod_nmi__312_102_nmi_warning_debugfs5
ffffffff82ebb720 l .init.data 0000000000000000 __initcall__kmod_microcode__275_891_save_microcode_in_initrd5
ffffffff82ebb724 l .init.data 0000000000000000 __initcall__kmod_hpet__211_1165_hpet_late_init5
...
ffffffff82ebb730 l .init.data 0000000000000000 __initcall__kmod_clocksource__211_1038_clocksource_done_booting5
ffffffff82ebb734 l .init.data 0000000000000000 __initcall__kmod_trace__397_9768_tracer_init_tracefs5

地址类型如下

1
2
3
4
5
6
(gdb) p sizeof(initcall_entry_t *)
$12 = 8
(gdb) p sizeof(initcall_entry_t) >>CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y时 typedef int initcall_entry_t;
$13 = 4
(gdb) p sizeof(initcall_t)
$14 = 8

可以发现问题, 从.initcall5.init section下的内存取出来的symbol 地址不对

1
2
3
4
5
6
7
(gdb) x/x 0xffffffff82ebb71c
0xffffffff82ebb71c: 0xffe8ae5dffe8262b
(gdb) x/x 0xffffffff82ebb720
0xffffffff82ebb720: 0xffe93272ffe8ae5d
(gdb) x/x 0xffffffff82ebb724
0xffffffff82ebb724: 0xffe935d2ffe93272

怀疑问题和CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y有关

禁止编译优化分析

禁止编译优化的几种尝试

  1. 内核设置编译优化等级为O0编译失败,内核支持-O2和-Os的编译优化

    1
    2
    ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE   >>默认                                                                                                                                                                           
    KBUILD_CFLAGS += -O0
  2. 内核设置编译等级为O1,但是启动panic

  3. init部分优化, 修改init/Makefile 如下,编译失败

    1
    2
    - ccflags-y := -fno-function-sections -fno-data-sections
    + ccflags-y := -fno-function-sections -fno-data-sections -O2

    修改为-O1,依旧为optimized out

  1. 增加如下,打印log为print fn is (____ptrval____),size is 8:4

    1
    2
    3
    4
    5
    1369         trace_initcall_level(initcall_level_names[level]);
    1370 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++){
    1371 pr_info("print fn is %p,size is %ld:%ld\n", fn, sizeof(fn), sizeof(initcall_entry_t));
    1372 do_one_initcall(initcall_from_entry(fn));
    1373 }

    重新调整打印指针为%px,打印如下

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d31cf6 <nmi_warning_debugfs>)
    at init/main.c:1289
    1289 {
    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d3a502 <save_microcode_in_initrd>)
    at init/main.c:1289
    1289 {

    (gdb) p initcall_levels
    $1 = {0xffffffff82ea0190, 0xffffffff82ea01a4, 0xffffffff82ea0290, 0xffffffff82ea0314,
    0xffffffff82ea0378, 0xffffffff82ea05e8, 0xffffffff82ea070c, 0xffffffff82ea0ae8, 0xffffffff82ea0c38}

    [ 1.175606] print fn is ffffffff82ea05e8,size is 8:4 >>逻辑正确,sizeof大小也正确
    [ 1.211950] print fn is ffffffff82ea05ec,size is 8:4


    (gdb) x/x 0xffffffff82ea05e8
    0xffffffff82ea05e8: 0xffe9170e >>这个怎么找到0xffffffff82d31cf6 <nmi_warning_debugfs>?

    (gdb) x/x 0xffffffff82ea05ec
    0xffffffff82ea05ec: 0xffe99f16

最终发现,0xffe9170e是一个32位有符号偏移(符号扩展为64位后是0xffffffffffe9170e,即负偏移),见offset_to_ptr, 0xffffffff82d31cf6 = 0xffffffff82ea05e8 + 0xffe9170e(按有符号数相加),最终找到nmi_warning_debugfs的symbol
由此可见,CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y的情况下function symbol addr = fn addr + *fn, .initcall5.init section下存储的是偏移(也可以理解为相对地址)

  1. 以下两种是没有symbol的(-O2)

     a. static 标记的并且只被调用过一次的可能被编译器优化为inline, 比如`do_initcall_level`
     b. static inline标记的函数,比如`initcall_from_entry`

    因此可能编译优化导致参数gdb打印为optimized out

    修改为noinline如下

    1
    2
    3
    4
    5
    6
    7
    8
    9
    -static void __init do_initcall_level(int level, char *command_line)
    +static noinline void __init do_initcall_level(int level, char *command_line)

    -static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
    +static noinline initcall_t initcall_from_entry(initcall_entry_t *entry)

    -static inline void *offset_to_ptr(const int *off)
    +static noinline void *offset_to_ptr(const int *off)

    注意,因为init.h是被很多模块都包含的,但是如果没有调用initcall_from_entry,就会在编译时提示如下,可以增加__maybe_unused
    或者__attribute__((unused))消除警告提示

    1
    2
    3
    4

    ./include/linux/init.h:122:28: 警告:‘initcall_from_entry’ defined but not used [-Wunused-function]
    static noinline initcall_t initcall_from_entry(initcall_entry_t *entry)

    新的符号表如下:

    1
    2
    3
    4
    5
    //符号表
    ffffffff82d2578f l F .init.text 000000000000008e do_initcall_level
    ...
    ffffffff81aee60c l F .text 0000000000000007 initcall_from_entry >>这里出现两次,应该是被两个目标文件生成的 init/main.c
    ffffffff81af001d l F .text 0000000000000007 initcall_from_entry >> kernel/printk/printk.c

    调试如下: 0xffffffff82d272f2 =0xffffffff82ea074c + 0xffe86ba6

    noinline测试
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92

    (gdb) b do_initcall_level
    Breakpoint 1 at 0xffffffff82d2578f: file init/main.c, line 1360.

    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 1, do_initcall_level (level=level@entry=6,
    command_line=command_line@entry=0xffff888003c18100 "root=/dev/sda rdinit=init crashkernel=128M console=ttyS0 rw nokaslr") at init/main.c:1360
    1360 {
    (gdb) p level
    $1 = 6
    (gdb) b initcall_from_entry
    Breakpoint 2 at 0xffffffff81aee60c: initcall_from_entry. (2 locations)
    (gdb) b do_one_initcall
    Breakpoint 3 at 0xffffffff81000f90: file init/main.c, line 1289.
    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 2, initcall_from_entry (entry=entry@entry=0xffffffff82ea074c)
    at ./include/linux/init.h:123
    123 {
    (gdb) p initcall_levels
    $2 = {0xffffffff82ea01d0, 0xffffffff82ea01e4, 0xffffffff82ea02d0, 0xffffffff82ea0354,
    0xffffffff82ea03b8, 0xffffffff82ea0628, 0xffffffff82ea074c, 0xffffffff82ea0b28, 0xffffffff82ea0c78}
    (gdb) b offset_to_ptr
    Breakpoint 4 at 0xffffffff8114a7b0: offset_to_ptr. (5 locations)
    (gdb) info b
    Num Type Disp Enb Address What
    1 breakpoint keep y 0xffffffff82d2578f in do_initcall_level at init/main.c:1360
    breakpoint already hit 7 times
    2 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    2.1 y 0xffffffff81aee60c in initcall_from_entry at ./include/linux/init.h:123
    2.2 y 0xffffffff81af001d in initcall_from_entry at ./include/linux/init.h:123
    3 breakpoint keep y 0xffffffff81000f90 in do_one_initcall at init/main.c:1289
    4 breakpoint keep y <MULTIPLE>
    4.1 y 0xffffffff8114a7b0 in offset_to_ptr at ./include/linux/compiler.h:233
    4.2 y 0xffffffff8118f220 in offset_to_ptr at ./include/linux/compiler.h:233
    4.3 y 0xffffffff815f6370 in offset_to_ptr at ./include/linux/compiler.h:233
    4.4 y 0xffffffff81aee600 in offset_to_ptr at ./include/linux/compiler.h:233
    4.5 y 0xffffffff81af0011 in offset_to_ptr at ./include/linux/compiler.h:233
    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 4, offset_to_ptr (off=off@entry=0xffffffff82ea074c)
    at ./include/linux/compiler.h:233
    233 {
    (gdb) b include/linux/compiler.h:235
    Breakpoint 5 at 0xffffffff8114a7bb: include/linux/compiler.h:235. (5 locations)
    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 5, offset_to_ptr (off=off@entry=0xffffffff82ea074c)
    at ./include/linux/compiler.h:235
    235 }
    (gdb) p off
    $3 = (const int *) 0xffffffff82ea074c

    (gdb) x/x *off
    0xffffffffffe86ba6: Cannot access memory at address 0xffffffffffe86ba6

    (gdb) x/x 0xffffffff82ea074c
    0xffffffff82ea074c: 0xffe86ba6
    (gdb) info b
    Num Type Disp Enb Address What
    1 breakpoint keep y 0xffffffff82d2578f in do_initcall_level at init/main.c:1360
    breakpoint already hit 7 times
    2 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    2.1 y 0xffffffff81aee60c in initcall_from_entry at ./include/linux/init.h:123
    2.2 y 0xffffffff81af001d in initcall_from_entry at ./include/linux/init.h:123
    3 breakpoint keep y 0xffffffff81000f90 in do_one_initcall at init/main.c:1289
    4 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    4.1 y 0xffffffff8114a7b0 in offset_to_ptr at ./include/linux/compiler.h:233
    4.2 y 0xffffffff8118f220 in offset_to_ptr at ./include/linux/compiler.h:233
    4.3 y 0xffffffff815f6370 in offset_to_ptr at ./include/linux/compiler.h:233
    4.4 y 0xffffffff81aee600 in offset_to_ptr at ./include/linux/compiler.h:233
    4.5 y 0xffffffff81af0011 in offset_to_ptr at ./include/linux/compiler.h:233
    5 breakpoint keep y <MULTIPLE>
    breakpoint already hit 1 time
    5.1 y 0xffffffff8114a7bb in offset_to_ptr at ./include/linux/compiler.h:235
    5.2 y 0xffffffff8118f22b in offset_to_ptr at ./include/linux/compiler.h:235
    5.3 y 0xffffffff815f637b in offset_to_ptr at ./include/linux/compiler.h:235
    5.4 y 0xffffffff81aee60b in offset_to_ptr at ./include/linux/compiler.h:235
    5.5 y 0xffffffff81af001c in offset_to_ptr at ./include/linux/compiler.h:235
    (gdb) c
    Continuing.

    Thread 1 hit Breakpoint 3, do_one_initcall (fn=0xffffffff82d272f2 <ia32_binfmt_init>) at init/main.c:1289
    1289 {

分析CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=n情况

调整CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=n时需要将
select HAVE_ARCH_PREL32_RELOCATIONS从arch/x86/Kconfig中注释掉,因为这里将其默认使能def_bool y

此时typedef initcall_t initcall_entry_t; 并且initcall_from_entry 直接return *entry; 因此.initcall5.init section下存储的应该是symbol地址

从vmlinux解析出来的符号表就可以验证:

1
2
3
4
5
ffffffff82ea0ab8 g       .init.data     0000000000000000 __initcall5_start

ffffffff82ea0ab8 l O .init.data 0000000000000008 __initcall__kmod_nmi__312_102_nmi_warning_debugfs5
ffffffff82ea0ac0 l O .init.data 0000000000000008 __initcall__kmod_microcode__275_891_save_microcode_in_initrd5 >>偏移是8字节
ffffffff82ea0ac8 l O .init.data 0000000000000008 __initcall__kmod_hpet__210_1165_hpet_late_init5

从内存存储空间来看,直接显示内存存储的指针指向function空间

1
2
3
4
5
6
7
//这块内存需要在init没有释放前去打印
(gdb) x/x 0xffffffff82ea0ab8
0xffffffff82ea0ab8 <__initcall__kmod_nmi__312_102_nmi_warning_debugfs5>: 0x82d31cd6
(gdb) x/x 0xffffffff82ea0ac0
0xffffffff82ea0ac0 <__initcall__kmod_microcode__275_891_save_microcode_in_initrd5>: 0x82d3a4e2
(gdb) x/x 0xffffffff82ea0ac8
0xffffffff82ea0ac8 <__initcall__kmod_hpet__210_1165_hpet_late_init5>: 0x82d4284d

参考

  1. 宋宝华: 关于Linux编译优化几个必须掌握的姿势
  2. Linux内核是如何巧妙的初始化各个模块的