mm/mm_init.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm_init.c - Memory initialisation verification and debugging
 *
 * Copyright 2008 IBM Corporation, 2008
 * Author Mel Gorman <mel@csn.ul.ie>
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/export.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/mman.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/padata.h>
#include <linux/nmi.h>
#include <linux/buffer_head.h>
#include <linux/kmemleak.h>
#include <linux/kfence.h>
#include <linux/page_ext.h>
#include <linux/pti.h>
#include <linux/pgtable.h>
#include <linux/stackdepot.h>
#include <linux/swap.h>
#include <linux/cma.h>
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"

#include <asm/setup.h>

#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;

/* The zonelists are simply reported, validation is manual. */
void __init mminit_verify_zonelist(void)
{
        int nid;

        if (mminit_loglevel < MMINIT_VERIFY)
                return;

        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
                struct zone *zone;
                struct zoneref *z;
                struct zonelist *zonelist;
                int i, listid, zoneid;

                BUILD_BUG_ON(MAX_ZONELISTS > 2);
                for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {

                        /* Identify the zone and nodelist */
                        zoneid = i % MAX_NR_ZONES;
                        listid = i / MAX_NR_ZONES;
                        zonelist = &pgdat->node_zonelists[listid];
                        zone = &pgdat->node_zones[zoneid];
                        if (!populated_zone(zone))
                                continue;

                        /* Print information about the zonelist */
                        printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
                                listid > 0 ? "thisnode" : "general", nid,
                                zone->name);

                        /* Iterate the zonelist */
                        for_each_zone_zonelist(zone, z, zonelist, zoneid)
                                pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
                        pr_cont("\n");
                }
        }
}
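
/*
 * With the loglevel high enough, the loop above emits one KERN_DEBUG line per
 * populated zone per zonelist. On a single-node x86_64 machine the output
 * would look roughly like the following (illustrative, not captured from a
 * real boot):
 *
 *     mminit::zonelist general 0:Normal = 0:Normal 0:DMA32 0:DMA
 *     mminit::zonelist general 0:DMA32 = 0:DMA32 0:DMA
 *     mminit::zonelist general 0:DMA = 0:DMA
 */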

79

80

void __init mminit_verify_pageflags_layout(void)

81

{

82

int shift, width;

83

unsigned long or_mask, add_mask;

84

85

shift = BITS_PER_LONG;

86

width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH

87

- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;

88

mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",

89

"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",

90

SECTIONS_WIDTH,

91

NODES_WIDTH,

92

ZONES_WIDTH,

93

LAST_CPUPID_WIDTH,

94

KASAN_TAG_WIDTH,

95

LRU_GEN_WIDTH,

96

LRU_REFS_WIDTH,

97

NR_PAGEFLAGS);

98

mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",

99

"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",

100

SECTIONS_SHIFT,

101

NODES_SHIFT,

102

ZONES_SHIFT,

103

LAST_CPUPID_SHIFT,

104

KASAN_TAG_WIDTH);

105

mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",

106

"Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",

107

(unsigned long)SECTIONS_PGSHIFT,

108

(unsigned long)NODES_PGSHIFT,

109

(unsigned long)ZONES_PGSHIFT,

110

(unsigned long)LAST_CPUPID_PGSHIFT,

111

(unsigned long)KASAN_TAG_PGSHIFT);

112

mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",

113

"Node/Zone ID: %lu -> %lu\n",

114

(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),

115

(unsigned long)ZONEID_PGOFF);

116

mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",

117

"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",

118

shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);

119

#ifdef NODE_NOT_IN_PAGE_FLAGS

120

mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",

121

"Node not in page flags");

122

#endif

123

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS

124

mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",

125

"Last cpupid not in page flags");

126

#endif

127

128

if (SECTIONS_WIDTH) {

129

shift -= SECTIONS_WIDTH;

130

BUG_ON(shift != SECTIONS_PGSHIFT);

131

}

132

if (NODES_WIDTH) {

133

shift -= NODES_WIDTH;

134

BUG_ON(shift != NODES_PGSHIFT);

135

}

136

if (ZONES_WIDTH) {

137

shift -= ZONES_WIDTH;

138

BUG_ON(shift != ZONES_PGSHIFT);

139

}

140

141

/* Check for bitmask overlaps */

142

or_mask = (ZONES_MASK << ZONES_PGSHIFT) |

143

(NODES_MASK << NODES_PGSHIFT) |

144

(SECTIONS_MASK << SECTIONS_PGSHIFT);

145

add_mask = (ZONES_MASK << ZONES_PGSHIFT) +

146

(NODES_MASK << NODES_PGSHIFT) +

147

(SECTIONS_MASK << SECTIONS_PGSHIFT);

148

BUG_ON(or_mask != add_mask);

149

}

150

151

static __init int set_mminit_loglevel(char *str)
{
        get_option(&str, &mminit_loglevel);
        return 0;
}
early_param("mminit_loglevel", set_mminit_loglevel);
#endif /* CONFIG_DEBUG_MEMORY_INIT */
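
/*
 * The checks in the block above are driven by the "mminit_loglevel=" early
 * parameter registered here; for example, booting with:
 *
 *     mminit_loglevel=4
 *
 * raises the level so that mminit_verify_zonelist() reports the zonelists
 * (level >= MMINIT_VERIFY) and the mminit_dprintk() calls trace the
 * pageflags layout (level >= MMINIT_TRACE). The enum values themselves are
 * assumed to live in mm/internal.h, which is not part of this file.
 */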

158

159

struct kobject *mm_kobj;

160

161

#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;

void mm_compute_batch(int overcommit_policy)
{
        u64 memsized_batch;
        s32 nr = num_present_cpus();
        s32 batch = max_t(s32, nr*2, 32);
        unsigned long ram_pages = totalram_pages();

        /*
         * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
         * (total memory/#cpus), and lift it to 25% for other policies
         * to ease the possible lock contention for percpu_counter
         * vm_committed_as, while the max limit is INT_MAX
         */
        if (overcommit_policy == OVERCOMMIT_NEVER)
                memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
        else
                memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);

        vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
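
/*
 * Worked example with illustrative numbers (not taken from the source): on a
 * machine with 8 present CPUs and 16 GiB of RAM (4194304 pages of 4 KiB),
 * the CPU-based floor is max(8*2, 32) = 32, while the memory-based value is
 * 4194304/8/256 = 2048 for OVERCOMMIT_NEVER or 4194304/8/4 = 131072 for the
 * other policies, so vm_committed_as_batch ends up as 2048 or 131072
 * respectively.
 */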

184

185

static int __meminit mm_compute_batch_notifier(struct notifier_block *self,

186

unsigned long action, void *arg)

187

{

188

switch (action) {

189

case MEM_ONLINE:

190

case MEM_OFFLINE:

191

mm_compute_batch(sysctl_overcommit_memory);

192

break;

193

default:

194

break;

195

}

196

return NOTIFY_OK;

197

}

198

199

static int __init mm_compute_batch_init(void)

200

{

201

mm_compute_batch(sysctl_overcommit_memory);

202

hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);

203

return 0;

204

}

205

206

__initcall(mm_compute_batch_init);

207

208

#endif

209

210

static int __init mm_sysfs_init(void)

211

{

212

mm_kobj = kobject_create_and_add("mm", kernel_kobj);

213

if (!mm_kobj)

214

return -ENOMEM;

215

216

return 0;

217

}

218

postcore_initcall(mm_sysfs_init);

219

220

static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;

221

static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;

222

static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;

223

224

static unsigned long required_kernelcore __initdata;

225

static unsigned long required_kernelcore_percent __initdata;

226

static unsigned long required_movablecore __initdata;

227

static unsigned long required_movablecore_percent __initdata;

228

229

static unsigned long nr_kernel_pages __initdata;

230

static unsigned long nr_all_pages __initdata;

231

232

static bool deferred_struct_pages __meminitdata;

233

234

static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);

235

236

static int __init cmdline_parse_core(char *p, unsigned long *core,
                                     unsigned long *percent)
{
        unsigned long long coremem;
        char *endptr;

        if (!p)
                return -EINVAL;

        /* Value may be a percentage of total memory, otherwise bytes */
        coremem = simple_strtoull(p, &endptr, 0);
        if (*endptr == '%') {
                /* Paranoid check for percent values greater than 100 */
                WARN_ON(coremem > 100);

                *percent = coremem;
        } else {
                coremem = memparse(p, &p);
                /* Paranoid check that UL is enough for the coremem value */
                WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);

                *core = coremem >> PAGE_SHIFT;
                *percent = 0UL;
        }
        return 0;
}
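
/*
 * Example of the two accepted forms (illustrative values): "movablecore=30%"
 * takes the '%' branch and stores *percent = 30, while "kernelcore=512M" is
 * fed through memparse(), giving 536870912 bytes, i.e. *core = 131072 pages
 * with 4 KiB pages.
 */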

262

263

bool mirrored_kernelcore __initdata_memblock;

264

265

/*

266

 * kernelcore=size sets the amount of memory for use for allocations that

267

 * cannot be reclaimed or migrated.

268

 */

269

static int __init cmdline_parse_kernelcore(char *p)

270

{

271

/* parse kernelcore=mirror */

272

if (parse_option_str(p, "mirror")) {

273

mirrored_kernelcore = true;

274

return 0;

275

}

276

277

return cmdline_parse_core(p, &required_kernelcore,

278

 &required_kernelcore_percent);

279

}

280

early_param("kernelcore", cmdline_parse_kernelcore);

281

282

/*

283

 * movablecore=size sets the amount of memory for use for allocations that

284

 * can be reclaimed or migrated.

285

 */

286

static int __init cmdline_parse_movablecore(char *p)

287

{

288

return cmdline_parse_core(p, &required_movablecore,

289

 &required_movablecore_percent);

290

}

291

early_param("movablecore", cmdline_parse_movablecore);

292

293

/*

294

 * early_calculate_totalpages()

295

 * Sum pages in active regions for movable zone.

296

 * Populate N_MEMORY for calculating usable_nodes.

297

 */

298

static unsigned long __init early_calculate_totalpages(void)

299

{

300

unsigned long totalpages = 0;

301

unsigned long start_pfn, end_pfn;

302

int i, nid;

303

304

for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {

305

unsigned long pages = end_pfn - start_pfn;

306

307

totalpages += pages;

308

if (pages)

309

node_set_state(nid, N_MEMORY);

310

}

311

return totalpages;

312

}

313

314

/*

315

 * This finds a zone that can be used for ZONE_MOVABLE pages. The

316

 * assumption is made that zones within a node are ordered by monotonically
 * increasing memory addresses, so that the "highest" populated zone is used.

318

 */

319

static void __init find_usable_zone_for_movable(void)

320

{

321

int zone_index;

322

for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {

323

if (zone_index == ZONE_MOVABLE)

324

continue;

325

326

if (arch_zone_highest_possible_pfn[zone_index] >

327

arch_zone_lowest_possible_pfn[zone_index])

328

break;

329

}

330

331

VM_BUG_ON(zone_index == -1);

332

movable_zone = zone_index;

333

}

334

335

/*

336

 * Find the PFN the Movable zone begins in each node. Kernel memory

337

 * is spread evenly between nodes as long as the nodes have enough

338

 * memory. When they don't, some nodes will have more kernelcore than

339

 * others

340

 */

341

static void __init find_zone_movable_pfns_for_nodes(void)

342

{

343

int i, nid;

344

unsigned long usable_startpfn;

345

unsigned long kernelcore_node, kernelcore_remaining;

346

/* save the state before borrowing the nodemask */

347

nodemask_t saved_node_state = node_states[N_MEMORY];

348

unsigned long totalpages = early_calculate_totalpages();

349

int usable_nodes = nodes_weight(node_states[N_MEMORY]);

350

struct memblock_region *r;

351

352

/* Need to find movable_zone earlier when movable_node is specified. */

353

find_usable_zone_for_movable();

354

355

/*

356

 * If movable_node is specified, ignore kernelcore and movablecore

357

 * options.

358

 */

359

if (movable_node_is_enabled()) {

360

for_each_mem_region(r) {

361

if (!memblock_is_hotpluggable(r))

362

continue;

363

364

nid = memblock_get_region_node(r);

365

366

usable_startpfn = PFN_DOWN(r->base);

367

zone_movable_pfn[nid] = zone_movable_pfn[nid] ?

368

min(usable_startpfn, zone_movable_pfn[nid]) :

369

usable_startpfn;

370

}

371

372

goto out2;

373

}

374

375

/*

376

 * If kernelcore=mirror is specified, ignore movablecore option

377

 */

378

if (mirrored_kernelcore) {

379

bool mem_below_4gb_not_mirrored = false;

380

381

if (!memblock_has_mirror()) {

382

pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");

383

goto out;

384

}

385

386

if (is_kdump_kernel()) {

387

pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");

388

goto out;

389

}

390

391

for_each_mem_region(r) {

392

if (memblock_is_mirror(r))

393

continue;

394

395

nid = memblock_get_region_node(r);

396

397

usable_startpfn = memblock_region_memory_base_pfn(r);

398

399

if (usable_startpfn < PHYS_PFN(SZ_4G)) {

400

mem_below_4gb_not_mirrored = true;

401

continue;

402

}

403

404

zone_movable_pfn[nid] = zone_movable_pfn[nid] ?

405

min(usable_startpfn, zone_movable_pfn[nid]) :

406

usable_startpfn;

407

}

408

409

if (mem_below_4gb_not_mirrored)

410

pr_warn("This configuration results in unmirrored kernel memory.\n");

411

412

goto out2;

413

}

414

415

/*

416

 * If kernelcore=nn% or movablecore=nn% was specified, calculate the

417

 * amount of necessary memory.

418

 */

419

if (required_kernelcore_percent)

420

required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /

421

 10000UL;

422

if (required_movablecore_percent)

423

required_movablecore = (totalpages * 100 * required_movablecore_percent) /

424

10000UL;

425

426

/*

427

 * If movablecore= was specified, calculate what size of

428

 * kernelcore that corresponds so that memory usable for

429

 * any allocation type is evenly spread. If both kernelcore

430

 * and movablecore are specified, then the value of kernelcore

431

 * will be used for required_kernelcore if it's greater than

432

 * what movablecore would have allowed.

433

 */

434

if (required_movablecore) {

435

unsigned long corepages;

436

437

/*

438

 * Round-up so that ZONE_MOVABLE is at least as large as what

439

 * was requested by the user

440

 */

441

required_movablecore =

442

roundup(required_movablecore, MAX_ORDER_NR_PAGES);

443

required_movablecore = min(totalpages, required_movablecore);

444

corepages = totalpages - required_movablecore;

445

446

required_kernelcore = max(required_kernelcore, corepages);

447

}

448

449

/*

450

 * If kernelcore was not specified or kernelcore size is larger

451

 * than totalpages, there is no ZONE_MOVABLE.

452

 */

453

if (!required_kernelcore || required_kernelcore >= totalpages)

454

goto out;

455

456

/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */

457

usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

458

459

restart:

460

/* Spread kernelcore memory as evenly as possible throughout nodes */

461

kernelcore_node = required_kernelcore / usable_nodes;

462

for_each_node_state(nid, N_MEMORY) {

463

unsigned long start_pfn, end_pfn;

464

465

/*

466

 * Recalculate kernelcore_node if the division per node

467

 * now exceeds what is necessary to satisfy the requested

468

 * amount of memory for the kernel

469

 */

470

if (required_kernelcore < kernelcore_node)

471

kernelcore_node = required_kernelcore / usable_nodes;

472

473

/*

474

 * As the map is walked, we track how much memory is usable

475

 * by the kernel using kernelcore_remaining. When it is

476

 * 0, the rest of the node is usable by ZONE_MOVABLE

477

 */

478

kernelcore_remaining = kernelcore_node;

479

480

/* Go through each range of PFNs within this node */

481

for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {

482

unsigned long size_pages;

483

484

start_pfn = max(start_pfn, zone_movable_pfn[nid]);

485

if (start_pfn >= end_pfn)

486

continue;

487

488

/* Account for what is only usable for kernelcore */

489

if (start_pfn < usable_startpfn) {

490

unsigned long kernel_pages;

491

kernel_pages = min(end_pfn, usable_startpfn)

492

- start_pfn;

493

494

kernelcore_remaining -= min(kernel_pages,

495

kernelcore_remaining);

496

required_kernelcore -= min(kernel_pages,

497

required_kernelcore);

498

499

/* Continue if range is now fully accounted */

500

if (end_pfn <= usable_startpfn) {

501

502

/*

503

 * Push zone_movable_pfn to the end so

504

 * that if we have to rebalance

505

 * kernelcore across nodes, we will

506

 * not double account here

507

 */

508

zone_movable_pfn[nid] = end_pfn;

509

continue;

510

}

511

start_pfn = usable_startpfn;

512

}

513

514

/*

515

 * The usable PFN range for ZONE_MOVABLE is from

516

 * start_pfn->end_pfn. Calculate size_pages as the

517

 * number of pages used as kernelcore

518

 */

519

size_pages = end_pfn - start_pfn;

520

if (size_pages > kernelcore_remaining)

521

size_pages = kernelcore_remaining;

522

zone_movable_pfn[nid] = start_pfn + size_pages;

523

524

/*

525

 * Some kernelcore has been met, update counts and

526

 * break if the kernelcore for this node has been

527

 * satisfied

528

 */

529

required_kernelcore -= min(required_kernelcore,

530

size_pages);

531

kernelcore_remaining -= size_pages;

532

if (!kernelcore_remaining)

533

break;

534

}

535

}

536

537

/*

538

 * If there is still required_kernelcore, we do another pass with one

539

 * less node in the count. This will push zone_movable_pfn[nid] further

540

 * along on the nodes that still have memory until kernelcore is

541

 * satisfied

542

 */

543

usable_nodes--;

544

if (usable_nodes && required_kernelcore > usable_nodes)

545

goto restart;

546

547

out2:

548

/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */

549

for (nid = 0; nid < MAX_NUMNODES; nid++) {

550

unsigned long start_pfn, end_pfn;

551

552

zone_movable_pfn[nid] =

553

roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

554

555

get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

556

if (zone_movable_pfn[nid] >= end_pfn)

557

zone_movable_pfn[nid] = 0;

558

}

559

560

out:

561

/* restore the node_state */

562

node_states[N_MEMORY] = saved_node_state;

563

}
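
/*
 * Rough example of the spreading logic above (illustrative numbers): with
 * kernelcore=2G on a two-node machine that has 4 GiB of usable memory per
 * node, usable_nodes is 2, so the first pass gives each node
 * kernelcore_node = 1 GiB worth of kernel pages; zone_movable_pfn[nid] then
 * lands roughly 1 GiB into each node's usable range and the remaining
 * ~3 GiB per node becomes ZONE_MOVABLE, before the final
 * MAX_ORDER_NR_PAGES round-up.
 */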

564

565

void __meminit __init_single_page(struct page *page, unsigned long pfn,

566

unsigned long zone, int nid)

567

{

568

mm_zero_struct_page(page);

569

set_page_links(page, zone, nid, pfn);

570

init_page_count(page);

571

page_mapcount_reset(page);

572

page_cpupid_reset_last(page);

573

page_kasan_tag_reset(page);

574

575

INIT_LIST_HEAD(&page->lru);

576

#ifdef WANT_PAGE_VIRTUAL

577

/* The shift won't overflow because ZONE_NORMAL is below 4G. */

578

if (!is_highmem_idx(zone))

579

set_page_address(page, __va(pfn << PAGE_SHIFT));

580

#endif

581

}

582

583

#ifdef CONFIG_NUMA

584

/*

585

 * During memory init memblocks map pfns to nids. The search is expensive and

586

 * this caches recent lookups. The implementation of __early_pfn_to_nid

587

 * treats start/end as pfns.

588

 */

589

struct mminit_pfnnid_cache {

590

unsigned long last_start;

591

unsigned long last_end;

592

int last_nid;

593

};

594

595

static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

596

597

/*

598

 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.

599

 */

600

static int __meminit __early_pfn_to_nid(unsigned long pfn,

601

struct mminit_pfnnid_cache *state)

602

{

603

unsigned long start_pfn, end_pfn;

604

int nid;

605

606

if (state->last_start <= pfn && pfn < state->last_end)

607

return state->last_nid;

608

609

nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);

610

if (nid != NUMA_NO_NODE) {

611

state->last_start = start_pfn;

612

state->last_end = end_pfn;

613

state->last_nid = nid;

614

}

615

616

return nid;

617

}

618

619

int __meminit early_pfn_to_nid(unsigned long pfn)

620

{

621

static DEFINE_SPINLOCK(early_pfn_lock);

622

int nid;

623

624

spin_lock(&early_pfn_lock);

625

nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);

626

if (nid < 0)

627

nid = first_online_node;

628

spin_unlock(&early_pfn_lock);

629

630

return nid;

631

}

632

633

int hashdist = HASHDIST_DEFAULT;

634

635

static int __init set_hashdist(char *str)

636

{

637

if (!str)

638

return 0;

639

hashdist = simple_strtoul(str, &str, 0);

640

return 1;

641

}

642

__setup("hashdist=", set_hashdist);

643

644

static inline void fixup_hashdist(void)

645

{

646

if (num_node_state(N_MEMORY) == 1)

647

hashdist = 0;

648

}

649

#else

650

static inline void fixup_hashdist(void) {}

651

#endif /* CONFIG_NUMA */

652

653

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT

654

static inline void pgdat_set_deferred_range(pg_data_t *pgdat)

655

{

656

pgdat->first_deferred_pfn = ULONG_MAX;

657

}

658

659

/* Returns true if the struct page for the pfn is initialised */

660

static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)

661

{

662

if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)

663

return false;

664

665

return true;

666

}

667

668

/*

669

 * Returns true when the remaining initialisation should be deferred until

670

 * later in the boot cycle when it can be parallelised.

671

 */

672

static bool __meminit

673

defer_init(int nid, unsigned long pfn, unsigned long end_pfn)

674

{

675

static unsigned long prev_end_pfn, nr_initialised;

676

677

if (early_page_ext_enabled())

678

return false;

679

/*

680

 * prev_end_pfn is a static that contains the end of the previous zone.

681

 * No need to protect because called very early in boot before smp_init.

682

 */

683

if (prev_end_pfn != end_pfn) {

684

prev_end_pfn = end_pfn;

685

nr_initialised = 0;

686

}

687

688

/* Always populate low zones for address-constrained allocations */

689

if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))

690

return false;

692

if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)

693

return true;

694

/*

695

 * We start only with one section of pages, more pages are added as

696

 * needed until the rest of deferred pages are initialized.

697

 */

698

nr_initialised++;

699

if ((nr_initialised > PAGES_PER_SECTION) &&

700

 (pfn & (PAGES_PER_SECTION - 1)) == 0) {

701

NODE_DATA(nid)->first_deferred_pfn = pfn;

702

return true;

703

}

704

return false;

705

}
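
/*
 * Rough illustration (assuming x86_64 with 4 KiB pages and 128 MiB sections,
 * i.e. PAGES_PER_SECTION == 32768): the low zones are always initialised,
 * but for the highest zone only about one section's worth of struct pages is
 * initialised at boot; once nr_initialised crosses PAGES_PER_SECTION at a
 * section-aligned pfn, first_deferred_pfn is recorded and the rest is left
 * to the deferred-init path.
 */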

706

707

static void __meminit init_reserved_page(unsigned long pfn, int nid)

708

{

709

pg_data_t *pgdat;

710

int zid;

711

712

if (early_page_initialised(pfn, nid))

713

return;

714

715

pgdat = NODE_DATA(nid);

716

717

for (zid = 0; zid < MAX_NR_ZONES; zid++) {

718

struct zone *zone = &pgdat->node_zones[zid];

719

720

if (zone_spans_pfn(zone, pfn))

721

break;

722

}

723

__init_single_page(pfn_to_page(pfn), pfn, zid, nid);

724

}

725

#else

726

static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}

727

728

static inline bool early_page_initialised(unsigned long pfn, int nid)

729

{

730

return true;

731

}

732

733

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)

734

{

735

return false;

736

}

737

738

static inline void init_reserved_page(unsigned long pfn, int nid)

739

{

740

}

741

#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

742

743

/*

744

 * Initialised pages do not have PageReserved set. This function is

745

 * called for each range allocated by the bootmem allocator and

746

 * marks the pages PageReserved. The remaining valid pages are later

747

 * sent to the buddy page allocator.

748

 */

749

void __meminit reserve_bootmem_region(phys_addr_t start,

750

 phys_addr_t end, int nid)

751

{

752

unsigned long start_pfn = PFN_DOWN(start);

753

unsigned long end_pfn = PFN_UP(end);

754

755

for (; start_pfn < end_pfn; start_pfn++) {

756

if (pfn_valid(start_pfn)) {

757

struct page *page = pfn_to_page(start_pfn);

758

759

init_reserved_page(start_pfn, nid);

760

761

/* Avoid false-positive PageTail() */

762

INIT_LIST_HEAD(&page->lru);

763

764

/*

765

 * no need for atomic set_bit because the struct

766

 * page is not visible yet so nobody should

767

 * access it yet.

768

 */

769

__SetPageReserved(page);

770

}

771

}

772

}

773

774

/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */

775

static bool __meminit

776

overlap_memmap_init(unsigned long zone, unsigned long *pfn)

777

{

778

static struct memblock_region *r;

779

780

if (mirrored_kernelcore && zone == ZONE_MOVABLE) {

781

if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {

782

for_each_mem_region(r) {

783

if (*pfn < memblock_region_memory_end_pfn(r))

784

break;

785

}

786

}

787

if (*pfn >= memblock_region_memory_base_pfn(r) &&

788

 memblock_is_mirror(r)) {

789

*pfn = memblock_region_memory_end_pfn(r);

790

return true;

791

}

792

}

793

return false;

794

}

795

796

/*

797

 * Only struct pages that correspond to ranges defined by memblock.memory

798

 * are zeroed and initialized by going through __init_single_page() during

799

 * memmap_init_zone_range().

800

 *

801

 * But, there could be struct pages that correspond to holes in

802

 * memblock.memory. This can happen because of the following reasons:

803

 * - physical memory bank size is not necessarily the exact multiple of the

804

 * arbitrary section size

805

 * - early reserved memory may not be listed in memblock.memory

806

 * - non-memory regions covered by the contiguous flatmem mapping

807

 * - memory layouts defined with memmap= kernel parameter may not align

808

 * nicely with memmap sections

809

 *

810

 * Explicitly initialize those struct pages so that:

811

 * - PG_Reserved is set

812

 * - zone and node links point to zone and node that span the page if the

813

 * hole is in the middle of a zone

814

 * - zone and node links point to adjacent zone/node if the hole falls on

815

 * the zone boundary; the pages in such holes will be prepended to the

816

 * zone/node above the hole except for the trailing pages in the last

817

 * section that will be appended to the zone/node below.

818

 */

819

static void __init init_unavailable_range(unsigned long spfn,

820

 unsigned long epfn,

821

 int zone, int node)

822

{

823

unsigned long pfn;

824

u64 pgcnt = 0;

825

826

for (pfn = spfn; pfn < epfn; pfn++) {

827

if (!pfn_valid(pageblock_start_pfn(pfn))) {

828

pfn = pageblock_end_pfn(pfn) - 1;

829

continue;

830

}

831

__init_single_page(pfn_to_page(pfn), pfn, zone, node);

832

__SetPageReserved(pfn_to_page(pfn));

833

pgcnt++;

834

}

835

836

if (pgcnt)

837

pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",

838

node, zone_names[zone], pgcnt);

839

}

840

841

/*

842

 * Initially all pages are reserved - free ones are freed

843

 * up by memblock_free_all() once the early boot process is

844

 * done. Non-atomic initialization, single-pass.

845

 *

846

 * All aligned pageblocks are initialized to the specified migratetype

847

 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related

848

 * zone stats (e.g., nr_isolate_pageblock) are touched.

849

 */

850

void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,

851

unsigned long start_pfn, unsigned long zone_end_pfn,

852

enum meminit_context context,

853

struct vmem_altmap *altmap, int migratetype)

854

{

855

unsigned long pfn, end_pfn = start_pfn + size;

856

struct page *page;

857

858

if (highest_memmap_pfn < end_pfn - 1)

859

highest_memmap_pfn = end_pfn - 1;

860

861

#ifdef CONFIG_ZONE_DEVICE

862

/*

863

 * Honor reservation requested by the driver for this ZONE_DEVICE

864

 * memory. We limit the total number of pages to initialize to just

865

 * those that might contain the memory mapping. We will defer the

866

 * ZONE_DEVICE page initialization until after we have released

867

 * the hotplug lock.

868

 */

869

if (zone == ZONE_DEVICE) {

870

if (!altmap)

871

return;

872

873

if (start_pfn == altmap->base_pfn)

874

start_pfn += altmap->reserve;

875

end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);

876

}

877

#endif

878

879

for (pfn = start_pfn; pfn < end_pfn; ) {

880

/*

881

 * There can be holes in boot-time mem_map[]s handed to this

882

 * function. They do not exist on hotplugged memory.

883

 */

884

if (context == MEMINIT_EARLY) {

885

if (overlap_memmap_init(zone, &pfn))

886

continue;

887

if (defer_init(nid, pfn, zone_end_pfn)) {

888

deferred_struct_pages = true;

889

break;

890

}

891

}

892

893

page = pfn_to_page(pfn);

894

__init_single_page(page, pfn, zone, nid);

895

if (context == MEMINIT_HOTPLUG)

896

__SetPageReserved(page);

897

898

/*

899

 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,

900

 * such that unmovable allocations won't be scattered all

901

 * over the place during system boot.

902

 */

903

if (pageblock_aligned(pfn)) {

904

set_pageblock_migratetype(page, migratetype);

905

cond_resched();

906

}

907

pfn++;

908

}

909

}

910

911

static void __init memmap_init_zone_range(struct zone *zone,

912

 unsigned long start_pfn,

913

 unsigned long end_pfn,

914

 unsigned long *hole_pfn)

915

{

916

unsigned long zone_start_pfn = zone->zone_start_pfn;

917

unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;

918

int nid = zone_to_nid(zone), zone_id = zone_idx(zone);

919

920

start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);

921

end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);

922

923

if (start_pfn >= end_pfn)

924

return;

925

926

memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,

927

 zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);

928

929

if (*hole_pfn < start_pfn)

930

init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);

931

932

*hole_pfn = end_pfn;

933

}

934

935

static void __init memmap_init(void)

936

{

937

unsigned long start_pfn, end_pfn;

938

unsigned long hole_pfn = 0;

939

int i, j, zone_id = 0, nid;

940

941

for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {

942

struct pglist_data *node = NODE_DATA(nid);

943

944

for (j = 0; j < MAX_NR_ZONES; j++) {

945

struct zone *zone = node->node_zones + j;

946

947

if (!populated_zone(zone))

948

continue;

949

950

memmap_init_zone_range(zone, start_pfn, end_pfn,

951

 &hole_pfn);

952

zone_id = j;

953

}

954

}

955

956

#ifdef CONFIG_SPARSEMEM

957

/*

958

 * Initialize the memory map for hole in the range [memory_end,

959

 * section_end].

960

 * Append the pages in this hole to the highest zone in the last

961

 * node.

962

 * The call to init_unavailable_range() is outside the ifdef to

963

 * silence the compiler warning about zone_id set but not used;

964

 * for FLATMEM it is a nop anyway

965

 */

966

end_pfn = round_up(end_pfn, PAGES_PER_SECTION);

967

if (hole_pfn < end_pfn)

968

#endif

969

init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);

970

}

971

972

#ifdef CONFIG_ZONE_DEVICE

973

static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,

974

 unsigned long zone_idx, int nid,

975

 struct dev_pagemap *pgmap)

976

{

977

978

__init_single_page(page, pfn, zone_idx, nid);

979

980

/*

981

 * Mark page reserved as it will need to wait for onlining

982

 * phase for it to be fully associated with a zone.

983

 *

984

 * We can use the non-atomic __set_bit operation for setting

985

 * the flag as we are still initializing the pages.

986

 */

987

__SetPageReserved(page);

988

989

/*

990

 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer

991

 * and zone_device_data. It is a bug if a ZONE_DEVICE page is

992

 * ever freed or placed on a driver-private list.

993

 */

994

page->pgmap = pgmap;

995

page->zone_device_data = NULL;

996

997

/*

998

 * Mark the block movable so that blocks are reserved for

999

 * movable at startup. This will force kernel allocations

1000

 * to reserve their blocks rather than leaking throughout

1001

 * the address space during boot when many long-lived

1002

 * kernel allocations are made.

1003

 *

1004

 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap

1005

 * because this is done early in section_activate()

1006

 */

1007

if (pageblock_aligned(pfn)) {

1008

set_pageblock_migratetype(page, MIGRATE_MOVABLE);

1009

cond_resched();

1010

}

1011

1012

/*

1013

 * ZONE_DEVICE pages are released directly to the driver page allocator

1014

 * which will set the page count to 1 when allocating the page.

1015

 */

1016

if (pgmap->type == MEMORY_DEVICE_PRIVATE ||

1017

 pgmap->type == MEMORY_DEVICE_COHERENT)

1018

set_page_count(page, 0);

1019

}

1020

1021

/*

1022

 * With compound page geometry and when struct pages are stored in ram most

1023

 * tail pages are reused. Consequently, the amount of unique struct pages to

1024

 * initialize is a lot smaller than the total amount of struct pages being

1025

 * mapped. This is a paired / mild layering violation with explicit knowledge

1026

 * of how the sparse_vmemmap internals handle compound pages in the lack

1027

 * of an altmap. See vmemmap_populate_compound_pages().

1028

 */

1029

static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,

1030

 struct dev_pagemap *pgmap)

1031

{

1032

if (!vmemmap_can_optimize(altmap, pgmap))

1033

return pgmap_vmemmap_nr(pgmap);

1034

1035

return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));

1036

}
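
/*
 * Illustrative numbers (assuming 4 KiB pages, a 64-byte struct page and
 * VMEMMAP_RESERVE_NR == 2): for a pgmap with vmemmap_shift == 9, i.e.
 * 2 MiB compound pages of 512 pfns each, an optimized vmemmap means only
 * 2 * (4096 / 64) = 128 unique struct pages per compound page are walked by
 * memmap_init_compound() instead of all 512.
 */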

1037

1038

static void __ref memmap_init_compound(struct page *head,

1039

 unsigned long head_pfn,

1040

 unsigned long zone_idx, int nid,

1041

 struct dev_pagemap *pgmap,

1042

 unsigned long nr_pages)

1043

{

1044

unsigned long pfn, end_pfn = head_pfn + nr_pages;

1045

unsigned int order = pgmap->vmemmap_shift;

1046

1047

__SetPageHead(head);

1048

for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {

1049

struct page *page = pfn_to_page(pfn);

1050

1051

__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);

1052

prep_compound_tail(head, pfn - head_pfn);

1053

set_page_count(page, 0);

1054

1055

/*

1056

 * The first tail page stores important compound page info.

1057

 * Call prep_compound_head() after the first tail page has

1058

 * been initialized, to not have the data overwritten.

1059

 */

1060

if (pfn == head_pfn + 1)

1061

prep_compound_head(head, order);

1062

}

1063

}

1064

1065

void __ref memmap_init_zone_device(struct zone *zone,

1066

 unsigned long start_pfn,

1067

 unsigned long nr_pages,

1068

 struct dev_pagemap *pgmap)

1069

{

1070

unsigned long pfn, end_pfn = start_pfn + nr_pages;

1071

struct pglist_data *pgdat = zone->zone_pgdat;

1072

struct vmem_altmap *altmap = pgmap_altmap(pgmap);

1073

unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);

1074

unsigned long zone_idx = zone_idx(zone);

1075

unsigned long start = jiffies;

1076

int nid = pgdat->node_id;

1077

1078

if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))

1079

return;

1080

1081

/*

1082

 * The call to memmap_init should have already taken care

1083

 * of the pages reserved for the memmap, so we can just jump to

1084

 * the end of that region and start processing the device pages.

1085

 */

1086

if (altmap) {

1087

start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);

1088

nr_pages = end_pfn - start_pfn;

1089

}

1090

1091

for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {

1092

struct page *page = pfn_to_page(pfn);

1093

1094

__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);

1095

1096

if (pfns_per_compound == 1)

1097

continue;

1098

1099

memmap_init_compound(page, pfn, zone_idx, nid, pgmap,

1100

 compound_nr_pages(altmap, pgmap));

1101

}

1102

1103

pr_debug("%s initialised %lu pages in %ums\n", __func__,

1104

nr_pages, jiffies_to_msecs(jiffies - start));

1105

}

1106

#endif

1107

1108

/*

1109

 * The zone ranges provided by the architecture do not include ZONE_MOVABLE

1110

 * because it is sized independent of architecture. Unlike the other zones,

1111

 * the starting point for ZONE_MOVABLE is not fixed. It may be different

1112

 * in each node depending on the size of each node and how evenly kernelcore

1113

 * is distributed. This helper function adjusts the zone ranges

1114

 * provided by the architecture for a given node by using the end of the

1115

 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that

1116

 * zones within a node are in order of monotonically increasing memory addresses

1117

 */

1118

static void __init adjust_zone_range_for_zone_movable(int nid,

1119

unsigned long zone_type,

1120

unsigned long node_end_pfn,

1121

unsigned long *zone_start_pfn,

1122

unsigned long *zone_end_pfn)

1123

{

1124

/* Only adjust if ZONE_MOVABLE is on this node */

1125

if (zone_movable_pfn[nid]) {

1126

/* Size ZONE_MOVABLE */

1127

if (zone_type == ZONE_MOVABLE) {

1128

*zone_start_pfn = zone_movable_pfn[nid];

1129

*zone_end_pfn = min(node_end_pfn,

1130

arch_zone_highest_possible_pfn[movable_zone]);

1131

1132

/* Adjust for ZONE_MOVABLE starting within this range */

1133

} else if (!mirrored_kernelcore &&

1134

*zone_start_pfn < zone_movable_pfn[nid] &&

1135

*zone_end_pfn > zone_movable_pfn[nid]) {

1136

*zone_end_pfn = zone_movable_pfn[nid];

1137

1138

/* Check if this whole range is within ZONE_MOVABLE */

1139

} else if (*zone_start_pfn >= zone_movable_pfn[nid])

1140

*zone_start_pfn = *zone_end_pfn;

1141

}

1142

}
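
/*
 * Concrete illustration (made-up PFNs): if a node spans PFNs [0x100000,
 * 0x300000) and zone_movable_pfn[nid] == 0x200000, then ZONE_MOVABLE gets
 * [0x200000, min(node_end_pfn, highest usable PFN)) from the first branch,
 * a kernel zone straddling 0x200000 has its end truncated to 0x200000 by the
 * middle branch (when kernelcore=mirror is not in use), and a zone starting
 * at or above 0x200000 is emptied by the last branch.
 */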

1143

1144

/*

1145

 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,

1146

 * then all holes in the requested range will be accounted for.

1147

 */

1148

static unsigned long __init __absent_pages_in_range(int nid,

1149

unsigned long range_start_pfn,

1150

unsigned long range_end_pfn)

1151

{

1152

unsigned long nr_absent = range_end_pfn - range_start_pfn;

1153

unsigned long start_pfn, end_pfn;

1154

int i;

1155

1156

for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {

1157

start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);

1158

end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);

1159

nr_absent -= end_pfn - start_pfn;

1160

}

1161

return nr_absent;

1162

}

1163

1164

/**

1165

 * absent_pages_in_range - Return number of page frames in holes within a range

1166

 * @start_pfn: The start PFN to start searching for holes

1167

 * @end_pfn: The end PFN to stop searching for holes

1168

 *

1169

 * Return: the number of page frames in memory holes within a range.

1170

 */

1171

unsigned long __init absent_pages_in_range(unsigned long start_pfn,

1172

unsigned long end_pfn)

1173

{

1174

return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);

1175

}

1176

1177

/* Return the number of page frames in holes in a zone on a node */

1178

static unsigned long __init zone_absent_pages_in_node(int nid,

1179

unsigned long zone_type,

1180

unsigned long zone_start_pfn,

1181

unsigned long zone_end_pfn)

1182

{

1183

unsigned long nr_absent;

1184

1185

/* zone is empty, we don't have any absent pages */

1186

if (zone_start_pfn == zone_end_pfn)

1187

return 0;

1188

1189

nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);

1190

1191

/*

1192

 * ZONE_MOVABLE handling.

1193

 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages

1194

 * and vice versa.

1195

 */

1196

if (mirrored_kernelcore && zone_movable_pfn[nid]) {

1197

unsigned long start_pfn, end_pfn;

1198

struct memblock_region *r;

1199

1200

for_each_mem_region(r) {

1201

start_pfn = clamp(memblock_region_memory_base_pfn(r),

1202

 zone_start_pfn, zone_end_pfn);

1203

end_pfn = clamp(memblock_region_memory_end_pfn(r),

1204

zone_start_pfn, zone_end_pfn);

1205

1206

if (zone_type == ZONE_MOVABLE &&

1207

 memblock_is_mirror(r))

1208

nr_absent += end_pfn - start_pfn;

1209

1210

if (zone_type == ZONE_NORMAL &&

1211

 !memblock_is_mirror(r))

1212

nr_absent += end_pfn - start_pfn;

1213

}

1214

}

1215

1216

return nr_absent;

1217

}

1218

1219

/*

1220

 * Return the number of pages a zone spans in a node, including holes

1221

 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()

1222

 */

1223

static unsigned long __init zone_spanned_pages_in_node(int nid,

1224

unsigned long zone_type,

1225

unsigned long node_start_pfn,

1226

unsigned long node_end_pfn,

1227

unsigned long *zone_start_pfn,

1228

unsigned long *zone_end_pfn)

1229

{

1230

unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];

1231

unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];

1232

1233

/* Get the start and end of the zone */

1234

*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);

1235

*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);

1236

adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,

1237

 zone_start_pfn, zone_end_pfn);

1238

1239

/* Check that this node has pages within the zone's required range */

1240

if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)

1241

return 0;

1242

1243

/* Move the zone boundaries inside the node if necessary */

1244

*zone_end_pfn = min(*zone_end_pfn, node_end_pfn);

1245

*zone_start_pfn = max(*zone_start_pfn, node_start_pfn);

1246

1247

/* Return the spanned pages */

1248

return *zone_end_pfn - *zone_start_pfn;

1249

}
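
/*
 * Example with made-up numbers: for a node spanning PFNs [0x100000,
 * 0x200000) asked about ZONE_DMA32 with arch limits [0, 0x100000), both
 * clamps collapse to 0x100000 and the function returns
 * 0x100000 - 0x100000 = 0 spanned pages, i.e. the node simply has no
 * DMA32 pages.
 */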

1250

1251

static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)

1252

{

1253

struct zone *z;

1254

1255

for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {

1256

z->zone_start_pfn = 0;

1257

z->spanned_pages = 0;

1258

z->present_pages = 0;

1259

#if defined(CONFIG_MEMORY_HOTPLUG)

1260

z->present_early_pages = 0;

1261

#endif

1262

}

1263

1264

pgdat->node_spanned_pages = 0;

1265

pgdat->node_present_pages = 0;

1266

pr_debug("On node %d totalpages: 0\n", pgdat->node_id);

1267

}

1268

1269

static void __init calc_nr_kernel_pages(void)

1270

{

1271

unsigned long start_pfn, end_pfn;

1272

phys_addr_t start_addr, end_addr;

1273

u64 u;

1274

#ifdef CONFIG_HIGHMEM

1275

unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];

1276

#endif

1277

1278

for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {

1279

start_pfn = PFN_UP(start_addr);

1280

end_pfn = PFN_DOWN(end_addr);

1281

1282

if (start_pfn < end_pfn) {

1283

nr_all_pages += end_pfn - start_pfn;

1284

#ifdef CONFIG_HIGHMEM

1285

start_pfn = clamp(start_pfn, 0, high_zone_low);

1286

end_pfn = clamp(end_pfn, 0, high_zone_low);

1287

#endif

1288

nr_kernel_pages += end_pfn - start_pfn;

1289

}

1290

}

1291

}

1292

1293

static void __init calculate_node_totalpages(struct pglist_data *pgdat,

1294

unsigned long node_start_pfn,

1295

unsigned long node_end_pfn)

1296

{

1297

unsigned long realtotalpages = 0, totalpages = 0;

1298

enum zone_type i;

1299

1300

for (i = 0; i < MAX_NR_ZONES; i++) {

1301

struct zone *zone = pgdat->node_zones + i;

1302

unsigned long zone_start_pfn, zone_end_pfn;

1303

unsigned long spanned, absent;

1304

unsigned long real_size;

1305

1306

spanned = zone_spanned_pages_in_node(pgdat->node_id, i,

1307

 node_start_pfn,

1308

 node_end_pfn,

1309

 &zone_start_pfn,

1310

 &zone_end_pfn);

1311

absent = zone_absent_pages_in_node(pgdat->node_id, i,

1312

 zone_start_pfn,

1313

 zone_end_pfn);

1314

1315

real_size = spanned - absent;

1316

1317

if (spanned)

1318

zone->zone_start_pfn = zone_start_pfn;

1319

else

1320

zone->zone_start_pfn = 0;

1321

zone->spanned_pages = spanned;

1322

zone->present_pages = real_size;

1323

#if defined(CONFIG_MEMORY_HOTPLUG)

1324

zone->present_early_pages = real_size;

1325

#endif

1326

1327

totalpages += spanned;

1328

realtotalpages += real_size;

1329

}

1330

1331

pgdat->node_spanned_pages = totalpages;

1332

pgdat->node_present_pages = realtotalpages;

1333

pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);

1334

}

1335

1336

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

1337

static void pgdat_init_split_queue(struct pglist_data *pgdat)

1338

{

1339

struct deferred_split *ds_queue = &pgdat->deferred_split_queue;

1340

1341

spin_lock_init(&ds_queue->split_queue_lock);

1342

INIT_LIST_HEAD(&ds_queue->split_queue);

1343

ds_queue->split_queue_len = 0;

1344

}

1345

#else

1346

static void pgdat_init_split_queue(struct pglist_data *pgdat) {}

1347

#endif

1348

1349

#ifdef CONFIG_COMPACTION

1350

static void pgdat_init_kcompactd(struct pglist_data *pgdat)

1351

{

1352

init_waitqueue_head(&pgdat->kcompactd_wait);

1353

}

1354

#else

1355

static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}

1356

#endif

1357

1358

static void __meminit pgdat_init_internals(struct pglist_data *pgdat)

1359

{

1360

int i;

1361

1362

pgdat_resize_init(pgdat);

1363

pgdat_kswapd_lock_init(pgdat);

1364

1365

pgdat_init_split_queue(pgdat);

1366

pgdat_init_kcompactd(pgdat);

1367

1368

init_waitqueue_head(&pgdat->kswapd_wait);

1369

init_waitqueue_head(&pgdat->pfmemalloc_wait);

1370

1371

for (i = 0; i < NR_VMSCAN_THROTTLE; i++)

1372

init_waitqueue_head(&pgdat->reclaim_wait[i]);

1373

1374

pgdat_page_ext_init(pgdat);

1375

lruvec_init(&pgdat->__lruvec);

1376

}

1377

1378

static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,

1379

unsigned long remaining_pages)

1380

{

1381

atomic_long_set(&zone->managed_pages, remaining_pages);

1382

zone_set_nid(zone, nid);

1383

zone->name = zone_names[idx];

1384

zone->zone_pgdat = NODE_DATA(nid);

1385

spin_lock_init(&zone->lock);

1386

zone_seqlock_init(zone);

1387

zone_pcp_init(zone);

1388

}

1389

1390

static void __meminit zone_init_free_lists(struct zone *zone)

1391

{

1392

unsigned int order, t;

1393

for_each_migratetype_order(order, t) {

1394

INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);

1395

zone->free_area[order].nr_free = 0;

1396

}

1397

1398

#ifdef CONFIG_UNACCEPTED_MEMORY

1399

INIT_LIST_HEAD(&zone->unaccepted_pages);

1400

#endif

1401

}

1402

1403

void __meminit init_currently_empty_zone(struct zone *zone,

1404

unsigned long zone_start_pfn,

1405

unsigned long size)

1406

{

1407

struct pglist_data *pgdat = zone->zone_pgdat;

1408

int zone_idx = zone_idx(zone) + 1;

1409

1410

if (zone_idx > pgdat->nr_zones)

1411

pgdat->nr_zones = zone_idx;

1412

1413

zone->zone_start_pfn = zone_start_pfn;

1414

1415

mminit_dprintk(MMINIT_TRACE, "memmap_init",

1416

"Initialising map node %d zone %lu pfns %lu -> %lu\n",

1417

pgdat->node_id,

1418

(unsigned long)zone_idx(zone),

1419

zone_start_pfn, (zone_start_pfn + size));

1420

1421

zone_init_free_lists(zone);

1422

zone->initialized = 1;

1423

}

1424

1425

#ifndef CONFIG_SPARSEMEM

1426

/*

1427

 * Calculate the size of the zone->blockflags rounded to an unsigned long

1428

 * Start by making sure zonesize is a multiple of pageblock_order by rounding

1429

 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally

1430

 * round what is now in bits to nearest long in bits, then return it in

1431

 * bytes.

1432

 */

1433

static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)

1434

{

1435

unsigned long usemapsize;

1436

1437

zonesize += zone_start_pfn & (pageblock_nr_pages-1);

1438

usemapsize = roundup(zonesize, pageblock_nr_pages);

1439

usemapsize = usemapsize >> pageblock_order;

1440

usemapsize *= NR_PAGEBLOCK_BITS;

1441

usemapsize = roundup(usemapsize, BITS_PER_LONG);

1442

1443

return usemapsize / BITS_PER_BYTE;

1444

}
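
/*
 * Worked example (illustrative, assuming 4 KiB pages, pageblock_order == 9
 * and NR_PAGEBLOCK_BITS == 4): a 1 GiB zone has 262144 pages, i.e. 512
 * pageblocks, needing 512 * 4 = 2048 bits; rounded to a multiple of
 * BITS_PER_LONG that is still 2048 bits, so 256 bytes of pageblock flags
 * are allocated.
 */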

1445

1446

static void __ref setup_usemap(struct zone *zone)

1447

{

1448

unsigned long usemapsize = usemap_size(zone->zone_start_pfn,

1449

 zone->spanned_pages);

1450

zone->pageblock_flags = NULL;

1451

if (usemapsize) {

1452

zone->pageblock_flags =

1453

memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,

1454

 zone_to_nid(zone));

1455

if (!zone->pageblock_flags)

1456

panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",

1457

 usemapsize, zone->name, zone_to_nid(zone));

1458

}

1459

}

1460

#else

1461

static inline void setup_usemap(struct zone *zone) {}

1462

#endif /* CONFIG_SPARSEMEM */

1463

1464

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE

1465

1466

/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */

1467

void __init set_pageblock_order(void)

1468

{

1469

unsigned int order = MAX_PAGE_ORDER;

1470

1471

/* Check that pageblock_nr_pages has not already been setup */

1472

if (pageblock_order)

1473

return;

1474

1475

/* Don't let pageblocks exceed the maximum allocation granularity. */

1476

if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)

1477

order = HUGETLB_PAGE_ORDER;

1478

1479

/*

1480

 * Assume the largest contiguous order of interest is a huge page.

1481

 * This value may be variable depending on boot parameters on powerpc.

1482

 */

1483

pageblock_order = order;

1484

}

1485

#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */

1486

1487

/*

1488

 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()

1489

 * is unused as pageblock_order is set at compile-time. See

1490

 * include/linux/pageblock-flags.h for the values of pageblock_order based on

1491

 * the kernel config

1492

 */

1493

void __init set_pageblock_order(void)

1494

{

1495

}

1496

1497

#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */

1498

1499

/*

1500

 * Set up the zone data structures

1501

 * - init pgdat internals

1502

 * - init all zones belonging to this node

1503

 *

1504

 * NOTE: this function is only called during memory hotplug

1505

 */

1506

#ifdef CONFIG_MEMORY_HOTPLUG

1507

void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)

1508

{

1509

int nid = pgdat->node_id;

1510

enum zone_type z;

1511

int cpu;

1512

1513

pgdat_init_internals(pgdat);

1514

1515

if (pgdat->per_cpu_nodestats == &boot_nodestats)

1516

pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

1517

1518

/*

1519

 * Reset the nr_zones, order and highest_zoneidx before reuse.

1520

 * Note that kswapd will init kswapd_highest_zoneidx properly

1521

 * when it starts in the near future.

1522

 */

1523

pgdat->nr_zones = 0;

1524

pgdat->kswapd_order = 0;

1525

pgdat->kswapd_highest_zoneidx = 0;

1526

pgdat->node_start_pfn = 0;

1527

pgdat->node_present_pages = 0;

1528

1529

for_each_online_cpu(cpu) {

1530

struct per_cpu_nodestat *p;

1531

1532

p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

1533

memset(p, 0, sizeof(*p));

1534

}

1535

1536

/*

1537

 * When memory is hot-added, all the memory is in offline state. So

1538

 * clear all zones' present_pages and managed_pages because they will

1539

 * be updated in online_pages() and offline_pages().

1540

 */

1541

for (z = 0; z < MAX_NR_ZONES; z++) {

1542

struct zone *zone = pgdat->node_zones + z;

1543

1544

zone->present_pages = 0;

1545

zone_init_internals(zone, z, nid, 0);

1546

}

1547

}

1548

#endif

1549

1550

static void __init free_area_init_core(struct pglist_data *pgdat)

1551

{

1552

enum zone_type j;

1553

int nid = pgdat->node_id;

1554

1555

pgdat_init_internals(pgdat);

1556

pgdat->per_cpu_nodestats = &boot_nodestats;

1557

1558

for (j = 0; j < MAX_NR_ZONES; j++) {

1559

struct zone *zone = pgdat->node_zones + j;

1560

unsigned long size = zone->spanned_pages;

1561

1562

/*

1563

 * Initialize zone->managed_pages as 0; it will be reset

1564

 * when memblock allocator frees pages into buddy system.

1565

 */

1566

zone_init_internals(zone, j, nid, zone->present_pages);

1567

1568

if (!size)

1569

continue;

1570

1571

setup_usemap(zone);

1572

init_currently_empty_zone(zone, zone->zone_start_pfn, size);

1573

}

1574

}

1575

1576

void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,

1577

 phys_addr_t min_addr, int nid, bool exact_nid)

1578

{

1579

void *ptr;

1580

1581

if (exact_nid)

1582

ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,

1583

 MEMBLOCK_ALLOC_ACCESSIBLE,

1584

 nid);

1585

else

1586

ptr = memblock_alloc_try_nid_raw(size, align, min_addr,

1587

 MEMBLOCK_ALLOC_ACCESSIBLE,

1588

 nid);

1589

1590

if (ptr && size > 0)

1591

page_init_poison(ptr, size);

1592

1593

return ptr;

1594

}

1595

1596

#ifdef CONFIG_FLATMEM

1597

static void __init alloc_node_mem_map(struct pglist_data *pgdat)

1598

{

1599

unsigned long start, offset, size, end;

1600

struct page *map;

1601

1602

/* Skip empty nodes */

1603

if (!pgdat->node_spanned_pages)

1604

return;

1605

1606

start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);

1607

offset = pgdat->node_start_pfn - start;

1608

/*

1609

 * The zone's endpoints aren't required to be MAX_PAGE_ORDER

1610

 * aligned but the node_mem_map endpoints must be in order

1611

 * for the buddy allocator to function correctly.

1612

 */

1613

end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES);

1614

size = (end - start) * sizeof(struct page);

1615

map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,

1616

 pgdat->node_id, false);

1617

if (!map)

1618

panic("Failed to allocate %ld bytes for node %d memory map\n",

1619

 size, pgdat->node_id);

1620

pgdat->node_mem_map = map + offset;

1621

pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",

1622

 __func__, pgdat->node_id, (unsigned long)pgdat,

1623

 (unsigned long)pgdat->node_mem_map);

1624

#ifndef CONFIG_NUMA

1625

/* the global mem_map is just set as node 0's */

1626

if (pgdat == NODE_DATA(0)) {

1627

mem_map = NODE_DATA(0)->node_mem_map;

1628

if (page_to_pfn(mem_map) != pgdat->node_start_pfn)

1629

mem_map -= offset;

1630

}

1631

#endif

1632

}

1633

#else

1634

static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }

1635

#endif /* CONFIG_FLATMEM */

1636

1637

/**

1638

 * get_pfn_range_for_nid - Return the start and end page frames for a node

1639

 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.

1640

 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.

1641

 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.

1642

 *

1643

 * It returns the start and end page frame of a node based on information

1644

 * provided by memblock_set_node(). If called for a node

1645

 * with no available memory, the start and end PFNs will be 0.

1646

 */

1647

void __init get_pfn_range_for_nid(unsigned int nid,

1648

unsigned long *start_pfn, unsigned long *end_pfn)

1649

{

1650

unsigned long this_start_pfn, this_end_pfn;

1651

int i;

1652

1653

*start_pfn = -1UL;

1654

*end_pfn = 0;

1655

1656

for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {

1657

*start_pfn = min(*start_pfn, this_start_pfn);

1658

*end_pfn = max(*end_pfn, this_end_pfn);

1659

}

1660

1661

if (*start_pfn == -1UL)

1662

*start_pfn = 0;

1663

}

static void __init free_area_init_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = 0;
	unsigned long end_pfn = 0;

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);

	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;
	pgdat->per_cpu_nodestats = NULL;

	if (start_pfn != end_pfn) {
		pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);

		calculate_node_totalpages(pgdat, start_pfn, end_pfn);
	} else {
		pr_info("Initmem setup node %d as memoryless\n", nid);

		reset_memoryless_node_totalpages(pgdat);
	}

	alloc_node_mem_map(pgdat);
	pgdat_set_deferred_range(pgdat);

	free_area_init_core(pgdat);
	lru_gen_init_pgdat(pgdat);
}

/* Any regular or high memory on that node ? */
static void __init check_for_memory(pg_data_t *pgdat)
{
	enum zone_type zone_type;

	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
		struct zone *zone = &pgdat->node_zones[zone_type];
		if (populated_zone(zone)) {
			if (IS_ENABLED(CONFIG_HIGHMEM))
				node_set_state(pgdat->node_id, N_HIGH_MEMORY);
			if (zone_type <= ZONE_NORMAL)
				node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
			break;
		}
	}
}

#if MAX_NUMNODES > 1
/*
 * Figure out the number of possible node ids.
 */
void __init setup_nr_node_ids(void)
{
	unsigned int highest;

	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
	nr_node_ids = highest + 1;
}
#endif

/*
 * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
 * such cases we allow max_zone_pfn sorted in the descending order
 */
static bool arch_has_descending_max_zone_pfns(void)
{
	return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}

/**
 * free_area_init - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by memblock_set_node(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, zone;
	bool descending;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
	descending = arch_has_descending_max_zone_pfns();

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (descending)
			zone = MAX_NR_ZONES - i - 1;
		else
			zone = i;

		if (zone == ZONE_MOVABLE)
			continue;

		end_pfn = max(max_zone_pfn[zone], start_pfn);
		arch_zone_lowest_possible_pfn[zone] = start_pfn;
		arch_zone_highest_possible_pfn[zone] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info("  %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info("  Node %d: %#018Lx\n", i,
				(u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/*
	 * Print out the early node map, and initialize the
	 * subsection-map relative to active online memory ranges to
	 * enable future "sub-section" extensions of the memory map.
	 */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);
		subsection_map_init(start_pfn, end_pfn - start_pfn);
	}

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	set_pageblock_order();

	for_each_node(nid) {
		pg_data_t *pgdat;

		if (!node_online(nid)) {
			/* Allocator not initialized yet */
			pgdat = arch_alloc_nodedata(nid);
			if (!pgdat)
				panic("Cannot allocate %zuB for node %d.\n",
				      sizeof(*pgdat), nid);
			arch_refresh_nodedata(nid, pgdat);
		}

		pgdat = NODE_DATA(nid);
		free_area_init_node(nid);

		/*
		 * No sysfs hierarchy will be created via register_one_node()
		 * for memory-less node because here it's not marked as
		 * N_MEMORY and won't be set online later. The benefit is
		 * userspace program won't be confused by sysfs
		 * files/directories of memory-less node. The pgdat will get
		 * fully initialized by hotadd_init_pgdat() when memory is
		 * hotplugged into this node.
		 */
		if (pgdat->node_present_pages) {
			node_set_state(nid, N_MEMORY);
			check_for_memory(pgdat);
		}
	}

	calc_nr_kernel_pages();
	memmap_init();

	/* disable hash distribution for systems with a single node */
	fixup_hashdist();
}
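
The loop at the top of free_area_init() turns the cumulative max_zone_pfn[] array into per-zone [lowest, highest) PFN pairs, with an empty zone collapsing to a zero-sized range and each zone starting where the previous one ended. A standalone sketch of that arithmetic, using made-up PFN values rather than any real zone layout:

/* Illustrative userspace sketch only -- not part of mm_init.c. */
#include <stdio.h>

#define NR_ZONES 3	/* pretend: DMA, NORMAL, HIGHMEM */

int main(void)
{
	/* Hypothetical cumulative limits; zone 1 == zone 0 -> zone 1 is empty. */
	unsigned long max_zone_pfn[NR_ZONES] = { 0x1000, 0x1000, 0x8000 };
	unsigned long lowest[NR_ZONES], highest[NR_ZONES];
	unsigned long start_pfn = 0;	/* stand-in for memblock_start_of_DRAM() */

	for (int i = 0; i < NR_ZONES; i++) {
		unsigned long end_pfn = max_zone_pfn[i] > start_pfn ?
					max_zone_pfn[i] : start_pfn;

		lowest[i] = start_pfn;
		highest[i] = end_pfn;
		start_pfn = end_pfn;	/* next zone starts where this one ends */
	}

	for (int i = 0; i < NR_ZONES; i++)
		printf("zone %d: %#lx-%#lx%s\n", i, lowest[i], highest[i],
		       lowest[i] == highest[i] ? " (empty)" : "");
	return 0;
}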

/**
 * node_map_pfn_alignment - determine the maximum internode alignment
 *
 * This function should be called after node map is populated and sorted.
 * It calculates the maximum power of two alignment which can distinguish
 * all the nodes.
 *
 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
 * shifted, 1GiB is enough and this function will indicate so.
 *
 * This is used to test whether pfn -> nid mapping of the chosen memory
 * model has fine enough granularity to avoid incorrect mapping for the
 * populated node map.
 *
 * Return: the determined alignment in pfn's. 0 if there is no alignment
 * requirement (single node).
 */
unsigned long __init node_map_pfn_alignment(void)
{
	unsigned long accl_mask = 0, last_end = 0;
	unsigned long start, end, mask;
	int last_nid = NUMA_NO_NODE;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
		if (!start || last_nid < 0 || last_nid == nid) {
			last_nid = nid;
			last_end = end;
			continue;
		}

		/*
		 * Start with a mask granular enough to pin-point to the
		 * start pfn and tick off bits one-by-one until it becomes
		 * too coarse to separate the current node from the last.
		 */
		mask = ~((1 << __ffs(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		/* accumulate all internode masks */
		accl_mask |= mask;
	}

	/* convert mask to number of pages */
	return ~accl_mask + 1;
}
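
The mask computation can be hard to follow from the comment alone. A small userspace sketch of the per-boundary step, using one invented internode boundary at 1GiB + 256MiB (PFN 0x50000 with 4KiB pages), shows how the mask is widened only while it still separates the previous node's end from the current node's start:

/* Illustrative userspace sketch only -- not part of mm_init.c. */
#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	/* Hypothetical boundary: node 0 ends and node 1 starts at PFN 0x50000. */
	unsigned long last_end = 0x50000;
	unsigned long start = 0x50000;
	unsigned long accl_mask = 0, mask;

	/* Start as fine-grained as the start PFN allows... */
	mask = ~((1UL << (ffs(start) - 1)) - 1);
	/* ...and widen while the coarser mask still tells the nodes apart. */
	while (mask && last_end <= (start & (mask << 1)))
		mask <<= 1;

	accl_mask |= mask;
	/* 0x10000 pages == 256MiB with 4KiB pages */
	printf("alignment: %#lx pages\n", ~accl_mask + 1);
	return 0;
}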

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(unsigned long pfn,
				       unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	if (!nr_pages)
		return;

	page = pfn_to_page(pfn);

	/* Free a large naturally-aligned chunk if possible */
	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
		__free_pages_core(page, MAX_PAGE_ORDER);
		return;
	}

	/* Accept chunks smaller than MAX_PAGE_ORDER upfront */
	accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));

	for (i = 0; i < nr_pages; i++, page++, pfn++) {
		if (pageblock_aligned(pfn))
			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
		__free_pages_core(page, 0);
	}
}

/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);

static inline void __init pgdat_init_report_one_done(void)
{
	if (atomic_dec_and_test(&pgdat_init_n_undone))
		complete(&pgdat_init_all_done_comp);
}

/*
 * Returns true if page needs to be initialized or freed to buddy allocator.
 *
 * We check if a current MAX_PAGE_ORDER block is valid by only checking the
 * validity of the head pfn.
 */
static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
	if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
		return false;
	return true;
}

/*
 * Free pages to buddy allocator. Try to free aligned pages in
 * MAX_ORDER_NR_PAGES sizes.
 */
static void __init deferred_free_pages(unsigned long pfn,
				       unsigned long end_pfn)
{
	unsigned long nr_free = 0;

	for (; pfn < end_pfn; pfn++) {
		if (!deferred_pfn_valid(pfn)) {
			deferred_free_range(pfn - nr_free, nr_free);
			nr_free = 0;
		} else if (IS_MAX_ORDER_ALIGNED(pfn)) {
			deferred_free_range(pfn - nr_free, nr_free);
			nr_free = 1;
		} else {
			nr_free++;
		}
	}
	/* Free the last block of pages to allocator */
	deferred_free_range(pfn - nr_free, nr_free);
}
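
deferred_free_pages() batches contiguous valid PFNs and hands each batch to deferred_free_range(), cutting the run whenever a PFN is invalid or a MAX_ORDER-aligned boundary is crossed. The run-splitting logic in isolation, as a userspace sketch with an invented validity test and a stand-in "free" that just prints the batch:

/* Illustrative userspace sketch only -- not part of mm_init.c. */
#include <stdio.h>
#include <stdbool.h>

#define CHUNK 8	/* stand-in for MAX_ORDER_NR_PAGES */

static bool pfn_ok(unsigned long pfn)
{
	return pfn != 21;	/* pretend PFN 21 is a hole */
}

static void free_range(unsigned long pfn, unsigned long nr)
{
	if (nr)
		printf("free [%lu, %lu)\n", pfn, pfn + nr);
}

int main(void)
{
	unsigned long nr_free = 0, pfn;

	for (pfn = 18; pfn < 30; pfn++) {
		if (!pfn_ok(pfn)) {
			free_range(pfn - nr_free, nr_free);
			nr_free = 0;
		} else if (pfn % CHUNK == 0) {	/* aligned: start a new batch */
			free_range(pfn - nr_free, nr_free);
			nr_free = 1;
		} else {
			nr_free++;
		}
	}
	free_range(pfn - nr_free, nr_free);	/* flush the tail */
	return 0;	/* prints [18,21), [22,24), [24,30) */
}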

1991

1992

/*

1993

 * Initialize struct pages. We minimize pfn page lookups and scheduler checks

1994

 * by performing it only once every MAX_ORDER_NR_PAGES.

1995

 * Return number of pages initialized.

1996

 */

1997

static unsigned long __init deferred_init_pages(struct zone *zone,

1998

 unsigned long pfn,

1999

 unsigned long end_pfn)

2000

{

2001

int nid = zone_to_nid(zone);

2002

unsigned long nr_pages = 0;

2003

int zid = zone_idx(zone);

2004

struct page *page = NULL;

2005

2006

for (; pfn < end_pfn; pfn++) {

2007

if (!deferred_pfn_valid(pfn)) {

2008

page = NULL;

2009

continue;

2010

} else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) {

2011

page = pfn_to_page(pfn);

2012

} else {

2013

page++;

2014

}

2015

__init_single_page(page, pfn, zid, nid);

2016

nr_pages++;

2017

}

2018

return nr_pages;

2019

}

2020

2021

/*

2022

 * This function is meant to pre-load the iterator for the zone init.

2023

 * Specifically it walks through the ranges until we are caught up to the

2024

 * first_init_pfn value and exits there. If we never encounter the value we

2025

 * return false indicating there are no valid ranges left.

2026

 */

2027

static bool __init

2028

deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,

2029

 unsigned long *spfn, unsigned long *epfn,

2030

 unsigned long first_init_pfn)

2031

{

2032

u64 j;

2033

2034

/*

2035

 * Start out by walking through the ranges in this zone that have

2036

 * already been initialized. We don't need to do anything with them

2037

 * so we just need to flush them out of the system.

2038

 */

2039

for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {

2040

if (*epfn <= first_init_pfn)

2041

continue;

2042

if (*spfn < first_init_pfn)

2043

*spfn = first_init_pfn;

2044

*i = j;

2045

return true;

2046

}

2047

2048

return false;

2049

}

2050

2051

/*

2052

 * Initialize and free pages. We do it in two loops: first we initialize

2053

 * struct page, then free to buddy allocator, because while we are

2054

 * freeing pages we can access pages that are ahead (computing buddy

2055

 * page in __free_one_page()).

2056

 *

2057

 * In order to try and keep some memory in the cache we have the loop

2058

 * broken along max page order boundaries. This way we will not cause

2059

 * any issues with the buddy page computation.

2060

 */

2061

static unsigned long __init

2062

deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,

2063

 unsigned long *end_pfn)

2064

{

2065

unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);

2066

unsigned long spfn = *start_pfn, epfn = *end_pfn;

2067

unsigned long nr_pages = 0;

2068

u64 j = *i;

2069

2070

/* First we loop through and initialize the page values */

2071

for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {

2072

unsigned long t;

2073

2074

if (mo_pfn <= *start_pfn)

2075

break;

2076

2077

t = min(mo_pfn, *end_pfn);

2078

nr_pages += deferred_init_pages(zone, *start_pfn, t);

2079

2080

if (mo_pfn < *end_pfn) {

2081

*start_pfn = mo_pfn;

2082

break;

2083

}

2084

}

2085

2086

/* Reset values and now loop through freeing pages as needed */

2087

swap(j, *i);

2088

2089

for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {

2090

unsigned long t;

2091

2092

if (mo_pfn <= spfn)

2093

break;

2094

2095

t = min(mo_pfn, epfn);

2096

deferred_free_pages(spfn, t);

2097

2098

if (mo_pfn <= epfn)

2099

break;

2100

}

2101

2102

return nr_pages;

2103

}

2104

2105

static void __init

2106

deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,

2107

 void *arg)

2108

{

2109

unsigned long spfn, epfn;

2110

struct zone *zone = arg;

2111

u64 i;

2112

2113

deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);

2114

2115

/*

2116

 * Initialize and free pages in MAX_PAGE_ORDER sized increments so that

2117

 * we can avoid introducing any issues with the buddy allocator.

2118

 */

2119

while (spfn < end_pfn) {

2120

deferred_init_maxorder(&i, zone, &spfn, &epfn);

2121

cond_resched();

2122

}

2123

}

2124

2125

/* An arch may override for more concurrency. */

2126

__weak int __init

2127

deferred_page_init_max_threads(const struct cpumask *node_cpumask)

2128

{

2129

return 1;

2130

}

2131

2132

/* Initialise remaining memory on a node */

2133

static int __init deferred_init_memmap(void *data)

2134

{

2135

pg_data_t *pgdat = data;

2136

const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

2137

unsigned long spfn = 0, epfn = 0;

2138

unsigned long first_init_pfn, flags;

2139

unsigned long start = jiffies;

2140

struct zone *zone;

2141

int zid, max_threads;

2142

u64 i;

2143

2144

/* Bind memory initialisation thread to a local node if possible */

2145

if (!cpumask_empty(cpumask))

2146

set_cpus_allowed_ptr(current, cpumask);

2147

2148

pgdat_resize_lock(pgdat, &flags);

2149

first_init_pfn = pgdat->first_deferred_pfn;

2150

if (first_init_pfn == ULONG_MAX) {

2151

pgdat_resize_unlock(pgdat, &flags);

2152

pgdat_init_report_one_done();

2153

return 0;

2154

}

2155

2156

/* Sanity check boundaries */

2157

BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);

2158

BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));

2159

pgdat->first_deferred_pfn = ULONG_MAX;

2160

2161

/*

2162

 * Once we unlock here, the zone cannot be grown anymore, thus if an

2163

 * interrupt thread must allocate this early in boot, zone must be

2164

 * pre-grown prior to start of deferred page initialization.

2165

 */

2166

pgdat_resize_unlock(pgdat, &flags);

2167

2168

/* Only the highest zone is deferred so find it */

2169

for (zid = 0; zid < MAX_NR_ZONES; zid++) {

2170

zone = pgdat->node_zones + zid;

2171

if (first_init_pfn < zone_end_pfn(zone))

2172

break;

2173

}

2174

2175

/* If the zone is empty somebody else may have cleared out the zone */

2176

if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,

2177

 first_init_pfn))

2178

goto zone_empty;

2179

2180

max_threads = deferred_page_init_max_threads(cpumask);

2181

2182

while (spfn < epfn) {

2183

unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);

2184

struct padata_mt_job job = {

2185

.thread_fn = deferred_init_memmap_chunk,

2186

.fn_arg = zone,

2187

.start = spfn,

2188

.size = epfn_align - spfn,

2189

.align = PAGES_PER_SECTION,

2190

.min_chunk = PAGES_PER_SECTION,

2191

.max_threads = max_threads,

2192

.numa_aware = false,

2193

};

2194

2195

padata_do_multithreaded(&job);

2196

deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,

2197

 epfn_align);

2198

}

2199

zone_empty:

2200

/* Sanity check that the next zone really is unpopulated */

2201

WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));

2202

2203

pr_info("node %d deferred pages initialised in %ums\n",

2204

pgdat->node_id, jiffies_to_msecs(jiffies - start));

2205

2206

pgdat_init_report_one_done();

2207

return 0;

2208

}

2209

2210

/*

2211

 * If this zone has deferred pages, try to grow it by initializing enough

2212

 * deferred pages to satisfy the allocation specified by order, rounded up to

2213

 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments

2214

 * of SECTION_SIZE bytes by initializing struct pages in increments of

2215

 * PAGES_PER_SECTION * sizeof(struct page) bytes.

2216

 *

2217

 * Return true when zone was grown, otherwise return false. We return true even

2218

 * when we grow less than requested, to let the caller decide if there are

2219

 * enough pages to satisfy the allocation.

2220

 */

2221

bool __init deferred_grow_zone(struct zone *zone, unsigned int order)

2222

{

2223

unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);

2224

pg_data_t *pgdat = zone->zone_pgdat;

2225

unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;

2226

unsigned long spfn, epfn, flags;

2227

unsigned long nr_pages = 0;

2228

u64 i;

2229

2230

/* Only the last zone may have deferred pages */

2231

if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))

2232

return false;

2233

2234

pgdat_resize_lock(pgdat, &flags);

2235

2236

/*

2237

 * If someone grew this zone while we were waiting for spinlock, return

2238

 * true, as there might be enough pages already.

2239

 */

2240

if (first_deferred_pfn != pgdat->first_deferred_pfn) {

2241

pgdat_resize_unlock(pgdat, &flags);

2242

return true;

2243

}

2244

2245

/* If the zone is empty somebody else may have cleared out the zone */

2246

if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,

2247

 first_deferred_pfn)) {

2248

pgdat->first_deferred_pfn = ULONG_MAX;

2249

pgdat_resize_unlock(pgdat, &flags);

2250

/* Retry only once. */

2251

return first_deferred_pfn != ULONG_MAX;

2252

}

2253

2254

/*

2255

 * Initialize and free pages in MAX_PAGE_ORDER sized increments so

2256

 * that we can avoid introducing any issues with the buddy

2257

 * allocator.

2258

 */

2259

while (spfn < epfn) {

2260

/* update our first deferred PFN for this section */

2261

first_deferred_pfn = spfn;

2262

2263

nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);

2264

touch_nmi_watchdog();

2265

2266

/* We should only stop along section boundaries */

2267

if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)

2268

continue;

2269

2270

/* If our quota has been met we can stop here */

2271

if (nr_pages >= nr_pages_needed)

2272

break;

2273

}

2274

2275

pgdat->first_deferred_pfn = spfn;

2276

pgdat_resize_unlock(pgdat, &flags);

2277

2278

return nr_pages > 0;

2279

}

2280

2281

#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
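
deferred_grow_zone() rounds the request up to whole sections so that first_deferred_pfn only ever stops on section boundaries. The rounding itself is just ALIGN(1 << order, PAGES_PER_SECTION); a quick sketch of its effect, with an assumed section size of 32768 pages (a common x86-64 value, not taken from this file):

/* Illustrative userspace sketch only -- not part of mm_init.c. */
#include <stdio.h>

#define SECTION_PAGES 32768UL	/* assumed PAGES_PER_SECTION */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	for (unsigned int order = 0; order <= 16; order += 8) {
		unsigned long need = ALIGN_UP(1UL << order, SECTION_PAGES);

		/* small orders all round up to at least one full section */
		printf("order %2u -> init %lu pages\n", order, need);
	}
	return 0;
}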

#ifdef CONFIG_CMA
void __init init_cma_reserved_pageblock(struct page *page)
{
	unsigned i = pageblock_nr_pages;
	struct page *p = page;

	do {
		__ClearPageReserved(p);
		set_page_count(p, 0);
	} while (++p, --i);

	set_pageblock_migratetype(page, MIGRATE_CMA);
	set_page_refcounted(page);
	__free_pages(page, pageblock_order);

	adjust_managed_page_count(page, pageblock_nr_pages);
	page_zone(page)->cma_pages += pageblock_nr_pages;
}
#endif

void set_zone_contiguous(struct zone *zone)
{
	unsigned long block_start_pfn = zone->zone_start_pfn;
	unsigned long block_end_pfn;

	block_end_pfn = pageblock_end_pfn(block_start_pfn);
	for (; block_start_pfn < zone_end_pfn(zone);
			block_start_pfn = block_end_pfn,
			 block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));

		if (!__pageblock_pfn_to_page(block_start_pfn,
					     block_end_pfn, zone))
			return;
		cond_resched();
	}

	/* We confirm that there is no hole */
	zone->contiguous = true;
}
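
set_zone_contiguous() walks the zone one pageblock-sized window at a time, clamping the last window to the zone end, and bails out on the first window that does not map to valid pages. The windowing pattern on its own, with made-up bounds and an assumed pageblock of 512 pages:

/* Illustrative userspace sketch only -- not part of mm_init.c. */
#include <stdio.h>

#define PAGEBLOCK 512UL	/* assumed pageblock_nr_pages */

int main(void)
{
	unsigned long zone_start = 100, zone_end = 1200;	/* invented PFNs */
	unsigned long start = zone_start;
	/* first window ends at the next pageblock boundary, like pageblock_end_pfn() */
	unsigned long end = (start + PAGEBLOCK) & ~(PAGEBLOCK - 1);

	for (; start < zone_end; start = end, end += PAGEBLOCK) {
		if (end > zone_end)
			end = zone_end;	/* clamp the final window */
		printf("check [%lu, %lu)\n", start, end);
	}
	return 0;
}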

void __init page_alloc_init_late(void)
{
	struct zone *zone;
	int nid;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT

	/* There will be num_node_state(N_MEMORY) threads */
	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
	for_each_node_state(nid, N_MEMORY) {
		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
	}

	/* Block until all are initialised */
	wait_for_completion(&pgdat_init_all_done_comp);

	/*
	 * We initialized the rest of the deferred pages.  Permanently disable
	 * on-demand struct page initialization.
	 */
	static_branch_disable(&deferred_pages);

	/* Reinit limits that are based on free pages after the kernel is up */
	files_maxfiles_init();
#endif

	buffer_init();

	/* Discard memblock private memory */
	memblock_discard();

	for_each_node_state(nid, N_MEMORY)
		shuffle_free_memory(NODE_DATA(nid));

	for_each_populated_zone(zone)
		set_zone_contiguous(zone);

	/* Initialize page ext after all struct pages are initialized. */
	if (deferred_struct_pages)
		page_ext_init();

	page_alloc_sysctl_init();
}

/*
 * Adaptive scale is meant to reduce sizes of hash tables on large memory
 * machines. As memory size is increased the scale is also increased but at
 * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
 * quadruples the scale is increased by one, which means the size of hash table
 * only doubles, instead of quadrupling as well.
 * Because 32-bit systems cannot have large physical memory, where this scaling
 * makes sense, it is disabled on such platforms.
 */
#if __BITS_PER_LONG > 32
#define ADAPT_SCALE_BASE	(64ul << 30)
#define ADAPT_SCALE_SHIFT	2
#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
#endif
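
The effect of ADAPT_SCALE_SHIFT is easiest to see numerically: every time the page count quadruples past the 64G baseline, scale grows by one, so the resulting table only doubles. A sketch of that loop with an assumed 4KiB page size and an arbitrary starting scale:

/* Illustrative userspace sketch only -- not part of mm_init.c. */
#include <stdio.h>

#define PAGE_SHIFT_X	12			/* assumed 4KiB pages */
#define SCALE_BASE	(64UL << 30)		/* 64G, as above */
#define SCALE_SHIFT	2
#define SCALE_NPAGES	(SCALE_BASE >> PAGE_SHIFT_X)

int main(void)
{
	/* memory sizes of 64G, 256G and 1T, expressed in pages */
	unsigned long sizes[] = { SCALE_NPAGES, SCALE_NPAGES << 2, SCALE_NPAGES << 4 };

	for (int i = 0; i < 3; i++) {
		int scale = 17;		/* hypothetical caller-supplied scale */
		unsigned long adapt;

		for (adapt = SCALE_NPAGES; adapt < sizes[i]; adapt <<= SCALE_SHIFT)
			scale++;
		/* one extra scale step per quadrupling -> table size only doubles */
		printf("%lu pages -> scale %d\n", sizes[i], scale);
	}
	return 0;
}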

/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 */
void *__init alloc_large_system_hash(const char *tablename,
				     unsigned long bucketsize,
				     unsigned long numentries,
				     int scale,
				     int flags,
				     unsigned int *_hash_shift,
				     unsigned int *_hash_mask,
				     unsigned long low_limit,
				     unsigned long high_limit)
{
	unsigned long long max = high_limit;
	unsigned long log2qty, size;
	void *table;
	gfp_t gfp_flags;
	bool virt;
	bool huge;

	/* allow the kernel cmdline to have a say */
	if (!numentries) {
		/* round applicable memory size up to nearest megabyte */
		numentries = nr_kernel_pages;

		/* It isn't necessary when PAGE_SIZE >= 1MB */
		if (PAGE_SIZE < SZ_1M)
			numentries = round_up(numentries, SZ_1M / PAGE_SIZE);

#if __BITS_PER_LONG > 32
		if (!high_limit) {
			unsigned long adapt;

			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
			     adapt <<= ADAPT_SCALE_SHIFT)
				scale++;
		}
#endif

		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);

		if (unlikely((numentries * bucketsize) < PAGE_SIZE))
			numentries = PAGE_SIZE / bucketsize;
	}
	numentries = roundup_pow_of_two(numentries);

	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		do_div(max, bucketsize);
	}
	max = min(max, 0x80000000ULL);

	if (numentries < low_limit)
		numentries = low_limit;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2(numentries);

	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
	do {
		virt = false;
		size = bucketsize << log2qty;
		if (flags & HASH_EARLY) {
			if (flags & HASH_ZERO)
				table = memblock_alloc(size, SMP_CACHE_BYTES);
			else
				table = memblock_alloc_raw(size,
							   SMP_CACHE_BYTES);
		} else if (get_order(size) > MAX_PAGE_ORDER || hashdist) {
			table = vmalloc_huge(size, gfp_flags);
			virt = true;
			if (table)
				huge = is_vm_area_hugepages(table);
		} else {
			/*
			 * If bucketsize is not a power-of-two, we may free
			 * some pages at the end of hash table which
			 * alloc_pages_exact() automatically does
			 */
			table = alloc_pages_exact(size, gfp_flags);
			kmemleak_alloc(table, size, 1, gfp_flags);
		}
	} while (!table && size > PAGE_SIZE && --log2qty);

	if (!table)
		panic("Failed to allocate %s hash table\n", tablename);

	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}
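
Putting the sizing rules together: when numentries is left at zero, the bucket count is derived from nr_kernel_pages, shifted by the (possibly adapted) scale, rounded up to a power of two and then clamped against the limits. A rough userspace sketch of that derivation, using an invented 16GiB machine and an assumed caller scale of 13 (one bucket per 8KiB); the specific numbers are assumptions, not values taken from this file:

/* Illustrative userspace sketch only -- not part of mm_init.c. */
#include <stdio.h>

#define PAGE_SHIFT_X 12	/* assumed 4KiB pages */

static unsigned long round_up_pow2(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long nr_pages = (16UL << 30) >> PAGE_SHIFT_X; /* invented 16GiB box */
	int scale = 13;		/* assumed scale: 1 bucket per 2^13 bytes */
	unsigned long entries = nr_pages;

	if (scale > PAGE_SHIFT_X)
		entries >>= (scale - PAGE_SHIFT_X);
	else
		entries <<= (PAGE_SHIFT_X - scale);

	entries = round_up_pow2(entries);
	printf("buckets: %lu (2^%d)\n", entries, __builtin_ctzl(entries));
	return 0;
}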

void __init memblock_free_pages(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
		int nid = early_pfn_to_nid(pfn);

		if (!early_page_initialised(pfn, nid))
			return;
	}

	if (!kmsan_memblock_free_pages(page, order)) {
		/* KMSAN will take care of these pages. */
		return;
	}

	/* pages were reserved and not allocated */
	if (mem_alloc_profiling_enabled()) {
		union codetag_ref *ref = get_page_tag_ref(page);

		if (ref) {
			set_codetag_empty(ref);
			put_page_tag_ref(ref);
		}
	}

	__free_pages_core(page, order);
}

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
EXPORT_SYMBOL(init_mlocked_on_free);

static bool _init_on_alloc_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
{
	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);

static bool _init_on_free_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
static int __init early_init_on_free(char *buf)
{
	return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);

static bool _init_mlocked_on_free_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON);
static int __init early_init_mlocked_on_free(char *buf)
{
	return kstrtobool(buf, &_init_mlocked_on_free_enabled_early);
}
early_param("init_mlocked_on_free", early_init_mlocked_on_free);

DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);

/*
 * Enable static keys related to various memory debugging and hardening options.
 * Some override others, and depend on early params that are evaluated in the
 * order of appearance. So we need to first gather the full picture of what was
 * enabled, and then make decisions.
 */
static void __init mem_debugging_and_hardening_init(void)
{
	bool page_poisoning_requested = false;
	bool want_check_pages = false;

#ifdef CONFIG_PAGE_POISONING
	/*
	 * Page poisoning is debug page alloc for some arches. If
	 * either of those options are enabled, enable poisoning.
	 */
	if (page_poisoning_enabled() ||
	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
	      debug_pagealloc_enabled())) {
		static_branch_enable(&_page_poisoning_enabled);
		page_poisoning_requested = true;
		want_check_pages = true;
	}
#endif

	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early ||
	    _init_mlocked_on_free_enabled_early) &&
	    page_poisoning_requested) {
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
			"will take precedence over init_on_alloc, init_on_free "
			"and init_mlocked_on_free\n");
		_init_on_alloc_enabled_early = false;
		_init_on_free_enabled_early = false;
		_init_mlocked_on_free_enabled_early = false;
	}

	if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) {
		pr_info("mem auto-init: init_on_free is on, "
			"will take precedence over init_mlocked_on_free\n");
		_init_mlocked_on_free_enabled_early = false;
	}

	if (_init_on_alloc_enabled_early) {
		want_check_pages = true;
		static_branch_enable(&init_on_alloc);
	} else {
		static_branch_disable(&init_on_alloc);
	}

	if (_init_on_free_enabled_early) {
		want_check_pages = true;
		static_branch_enable(&init_on_free);
	} else {
		static_branch_disable(&init_on_free);
	}

	if (_init_mlocked_on_free_enabled_early) {
		want_check_pages = true;
		static_branch_enable(&init_mlocked_on_free);
	} else {
		static_branch_disable(&init_mlocked_on_free);
	}

	if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early ||
	    _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early))
		pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and "
			"init_mlocked_on_free are disabled when running KMSAN\n");

#ifdef CONFIG_DEBUG_PAGEALLOC
	if (debug_pagealloc_enabled()) {
		want_check_pages = true;
		static_branch_enable(&_debug_pagealloc_enabled);

		if (debug_guardpage_minorder())
			static_branch_enable(&_debug_guardpage_enabled);
	}
#endif

	/*
	 * Any page debugging or hardening option also enables sanity checking
	 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
	 * enabled already.
	 */
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
		static_branch_enable(&check_pages_enabled);
}

/* Report memory auto-initialization states for this boot. */
static void __init report_meminit(void)
{
	const char *stack;

	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
		stack = "all(pattern)";
	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
		stack = "all(zero)";
	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
		stack = "byref_all(zero)";
	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
		stack = "byref(zero)";
	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
		stack = "__user(zero)";
	else
		stack = "off";

	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n",
		stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
		want_init_on_free() ? "on" : "off",
		want_init_mlocked_on_free() ? "on" : "off");
	if (want_init_on_free())
		pr_info("mem auto-init: clearing system memory may take some time...\n");
}

static void __init mem_init_print_info(void)
{
	unsigned long physpages, codesize, datasize, rosize, bss_size;
	unsigned long init_code_size, init_data_size;

	physpages = get_num_physpages();
	codesize = _etext - _stext;
	datasize = _edata - _sdata;
	rosize = __end_rodata - __start_rodata;
	bss_size = __bss_stop - __bss_start;
	init_data_size = __init_end - __init_begin;
	init_code_size = _einittext - _sinittext;

	/*
	 * Detect special cases and adjust section sizes accordingly:
	 * 1) .init.* may be embedded into .data sections
	 * 2) .init.text.* may be out of [__init_begin, __init_end],
	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
	 * 3) .rodata.* may be embedded into .text or .data sections.
	 */
#define adj_init_size(start, end, size, pos, adj) \
	do { \
		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
			size -= adj; \
	} while (0)

	adj_init_size(__init_begin, __init_end, init_data_size,
		     _sinittext, init_code_size);
	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);

#undef	adj_init_size

	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef	CONFIG_HIGHMEM
		", %luK highmem"
#endif
		")\n",
		K(nr_free_pages()), K(physpages),
		codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
		(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
		K(physpages - totalram_pages() - totalcma_pages),
		K(totalcma_pages)
#ifdef	CONFIG_HIGHMEM
		, K(totalhigh_pages())
#endif
		);
}

/*
 * Set up kernel memory allocators
 */
void __init mm_core_init(void)
{
	/* Initializations relying on SMP setup */
	build_all_zonelists(NULL);
	page_alloc_init_cpuhp();

	/*
	 * page_ext requires contiguous pages,
	 * bigger than MAX_PAGE_ORDER unless SPARSEMEM.
	 */
	page_ext_init_flatmem();
	mem_debugging_and_hardening_init();
	kfence_alloc_pool_and_metadata();
	report_meminit();
	kmsan_init_shadow();
	stack_depot_early_init();
	mem_init();
	mem_init_print_info();
	kmem_cache_init();
	/*
	 * page_owner must be initialized after buddy is ready, and also after
	 * slab is ready so that stack_depot_init() works properly
	 */
	page_ext_init_flatmem_late();
	kmemleak_init();
	ptlock_cache_init();
	pgtable_cache_init();
	debug_objects_mem_init();
	vmalloc_init();
	/* If no deferred init page_ext now, as vmap is fully initialized */
	if (!deferred_struct_pages)
		page_ext_init();
	/* Should be run before the first non-init thread is created */
	init_espfix_bsp();
	/* Should be run after espfix64 is set up. */
	pti_init();
	kmsan_init_runtime();
	mm_cache_init();
	execmem_init();
}
