mmap_avi 99 0 (a Aba FLA.
913	/* Insert vm structure into process list sorted by address
914	 * and into the inode's i_mmap ring. If vm_file is non-NULL
915	 * then the i_shared_lock must be held here.
916	 */
917	void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vmp)
918	{
919		struct vm_area_struct **pprev;
920		struct file * file;
921	
922		if (!mm->mmap_avl) {
923			pprev = &mm->mmap;
924			while (*pprev && (*pprev)->vm_start <= vmp->vm_start)
925				pprev = &(*pprev)->vm_next;
926		} else {
927			struct vm_area_struct *prev, *next;
928			avl_insert_neighbours(vmp, &mm->mmap_avl, &prev, &next);
929			pprev = (prev ? &prev->vm_next : &mm->mmap);
930			if (*pprev != next)
931				printk("insert_vm_struct: tree inconsistent with list\n");
932		}
933		vmp->vm_next = *pprev;
934		*pprev = vmp;
935	
936		mm->map_count++;
937		if (mm->map_count >= AVL_MIN_MAP_COUNT && !mm->mmap_avl)
938			build_mmap_avl(mm);
939	
940	}
AN MEAE TTD x TELS CREABE AS, TER EBA SP SREB OR EAS TI TL ASG ED,
SE. AVL AM fd “11s ff) 38CHIF ACS] AVL_MIN_MAP_COUNT, &{! 32 IM, $08 250812 build_mmap_avi()
SEXL AVL 4, DREAD RACE T
2.4 越界访问
TAF le EAL Haag RSL) eG
‘He » WM SRS ASD ES ep a A TTT AE CPU ACR A FD SUP DN PP 7, RECT
FG NADAS te BRED FER. HEM CPU rR ATA (Page Fault) S#%5 (Exception) (th
RRORIUP IT), SRM RUAY PON A OCCT FR AS REI EN PPE A A WRT AA TO OT SAT
eT. seattiT SARE. HET BLOT ET LLU PLR
© HALAS TU RN TR Ae I. HA ALR tht 5 Oe hk UR eR ARE, ak
BUSA.
© AMOR RAE ATE
© ROPMENTALASRMMRMAY, PLR AS—T “AR” RT.
RMA, RTE BAP RA PATIL Hat mmapl id Fs BY Fe,
A ROAR GILL munmap( ASEH). ZENER -MBRAL PCIE, FSH ete esta Ea]
SP RAMI ASI, TAUB EE AY. PU, ALR, Bt
+53Linux PARE Li
ERLE EA HOHE AR UG HK RAT) ESR, RARE APD. RN,
ETE —P EREHE CInvalid Address) THiS {HBBRAT AW, IMT BRA A:T — UC TU th EP
"PUTA A RP AIMEE “SPIELE” Roop eH, AEA ALA LR
SAA MIR SEM EM. ARBRE CPU MET EBA TRIB IRS AE
do_page_faul HAT.
RAK do_page_faul( )AIARIZEM FF archi386/mmvfault.c FP. XA MMV, BAST
‘ESE RA PE HIE TA 9 HY EM IERETAIUTRB:
106	asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
107	{
108		struct task_struct *tsk;
109		struct mm_struct *mm;
110		struct vm_area_struct * vma;
111		unsigned long address;
112		unsigned long page;
113		unsigned long fixup;
114		int write;
115		siginfo_t info;
116	
117		/* get the address */
118		__asm__("movl %%cr2,%0":"=r" (address));
119	
120		tsk = current;
121	
122		/*
123		 * We fault-in kernel-space virtual memory on-demand. The
124		 * 'reference' page table is init_mm.pgd.
125		 *
126		 * NOTE! We MUST NOT take any locks for this case. We may
127		 * be in an interrupt or a critical region, and should
128		 * only copy the information from the master page table,
129		 * nothing more.
130		 */
131		if (address >= TASK_SIZE)
132			goto vmalloc_fault;
133	
134		mm = tsk->mm;
135		info.si_code = SEGV_MAPERR;
136	
137		/*
138		 * If we're in an interrupt or have no user
139		 * context, we must not take the fault..
140		 */
141		if (in_interrupt() || !mm)
142			goto no_context;
143	
144		down(&mm->mmap_sem);
145	
146		vma = find_vma(mm, address);
147		if (!vma)
148			goto bad_area;
149		if (vma->vm_start <= address)
150			goto good_area;
151		if (!(vma->vm_flags & VM_GROWSDOWN))
152			goto bad_area;
TET. AA EFA A? 4 1386 CPU ree “THE” SPRAY, CPU HER BCU AT
SR AR es aL EAS Hh AEA CRIP, MA PARSE eM. A, EC
SEP REA AADUADIE TF ALS) BT CAH BER CR2 AIS, BLL SLMEFEN WRT. ATIC RS A A aH AL
TO RAMRA RS, “EHE%O FREE address B28 A, JERSE MLN Bea OPAL — PAT ET 6
RR, ARM / RRMA ATS. ME ptregs MHUHtt regs, Cast
REMY CPU PH RETR AE OP AR, CE LP RL BL ER POR AD DR"
error_code it~ 25-45 BWI 6A FLA BU»
JRC TTEAR AN task_struct SUR AY. AE BT DUEL RE current IRA Saint
PE CHBTE TEDL AT MEAL) AY task_struct if (MMW. TERE REREAY task_struct SiH MEET, 1H
FAS mm_struct SARS, TERA EAU A AT AEB Pe. QU SEE, CPU SchR
SAT ARAN IE AN mm_struct SH, hd AAR OL AT EL A ALA at on A RA Fe AT,
mmm_struct SiH )RORT , SUPT BEARLE T PRET.
PERS, BER MEPARIS. — (RPTL AL in_interrupt( )IBIELF 0, BEAAWERTBN A Wea A Ze
FAREED, A SARA. Th MTD STRELA mm THOSE, at
ABREU MARES, MNRAS AD BE SATE. WHE, AER AUREL AX, in_interrupt( )
WEE) 0, MKT REE TAMIR? MARRERO / RRS, RAAT
in_interrupt( )EKU PITA PTE MRR EIS ATA OL HEAL goto HATH BART no_cotext
Sh, BARS RGR MEIER, ATER ABBE
CURSE EA HRW EER, AURA RMR ATDL, PDR atte SNS PIV HtP. BD
down( fup( RTI GRIE. SIA HAY, Zé mm_struct 44 PE LET AAS mmap_sem. ik
FE Medown( iB IFIDUS, ®t 72 47 HARRI T -
AMER, ARISE TSE AR KI OS Sak Be SA 5 Be RS I
DATTA BE re PN OE fa), RE PE). SIE,
find_vma( ) BFS AATSHF. LAAT BEE. find_vma( yet FACE sie AP PH 24 ka A
R-AK, WREATH, DANA SAYER MTs. BA. HPAI PS
HARBIIE? PCF ABO EE ARREARS TILA, LR, TURAL ROES
BBR ABE A Riel ESET. UR A 7 nd Tae, A A
JEEZ b, BAA IG TET. BMY RT RT, BORER AT. We
SLAIN bad_area, Asii$ (1138 MARR ATRL AIA IK MAT 6
to RALH AA AN, AL Pe ars Me Pee Ms COLA 148 77), ABR HE
OGAWA RR TERME A AE, BUNT SE ZA, DITLABRSE IA good _area JE — 2 He UU.
55.e240 Ite
CSS Saeed
Linus Pei
RARER PITRE BS.
TRS, ATA a TRS PAE A, AR SL TE TED
RARER OLHA, TAREE, TRAP AM SI. R-HSMAREA—T, IB
RAE EMMA, CRA taba A Ae OBR RBM brkc( )) TDA SHAE 2A Ih),
ABU ANCATD HLS TET, AMER. RAE MAPLE, (HUE,
SHA AUER AIR TEIN? ALE 150 tr. ATAU, JERR PURER, oe
find_vma( ARB) R i] HERE fe, BALE EA vm_flags PWIA Mardi, VM_GROWSDOWN. &
FEARS 0 BIA, ABET SSI L Fy RODE IAD HAE, RBH ASST AR MD Ue
AMA PH, RA eee et T — Pebble A ARE. RRMA RT BLL
FRU, RANT MBH AT IS BAY goto 1 A)H: fH) bad_area, BUEE 224 {TF
[do_page_fault()]

220	/*
221	 * Something tried to access memory that isn't in our memory map..
222	 * Fix it, but check if it's kernel or user first..
223	 */
224	bad_area:
225		up(&mm->mmap_sem);
226	
227	bad_area_nosemaphore:
228		/* User mode accesses just cause a SIGSEGV */
229		if (error_code & 4) {
230			tsk->thread.cr2 = address;
231			tsk->thread.error_code = error_code;
232			tsk->thread.trap_no = 14;
233			info.si_signo = SIGSEGV;
234			info.si_errno = 0;
235			/* info.si_code has been set above */
236			info.si_addr = (void *)address;
237			force_sig_info(SIGSEGV, &info, tsk);
238			return;
239		}
Hh, SRA A, CAPAMBA SE (BARAT mm_struct SMUT HRIE), DILL
Lup HAR. A, ME ER errorcode, HAAR ARB RUBAER BIEIN TPE
we:
96	/*
97	 * This routine handles page faults.  It determines the address,
98	 * and the problem, and then passes it off to one of the appropriate
99	 * routines.
100	 *
101	 * error_code:
102	 *	bit 0 == 0 means no page found, 1 means protection fault
103	 *	bit 1 == 0 means read, 1 means write
104	 *	bit 2 == 0 means kernel, 1 means user-mode
105	 */
“4 error_code BJ bit? 2 1 BL, AeA RMAE CPU AEF HU RESO EEN, BOE SRT RATT.
PEARSE REA 229 17. CEA, NY NRIMEREIN task_struct HAM RR, a
Ba NaN AE” RAK RHI”) SIGSEGV. ilk, AUS i ART
ER ARIS: “BARES? "ARN, 56TH ORM EA TB et A SBE
TORRES WA. SUOMI / RPBPZLH, PECANS RM ARM fe SEE,
ERAGE MARE YREAAM. HepAebA—TAA SIGSEGV. Wis, ARMIES Abi SAUTE
RUREBAR IPE EAI. LER MMM Re “WR” OY, APE RIAD. AT
SIGSEGV #55057, SLB, SUB R ALTE RAM GPL Sa AM AS BEAD
“Segment Fault” #243, #R/G MEEREME CHRD. EAS Abu BAL ein AA, ZEA
REFARX, MAKRATEE So
FANE BBE F do_page_fault( FSG, WMA RBS RATE PMT EMRE
Aids DUPE RUD RF BAIS ELBA CS HK
2.5 用户堆栈的扩展
EL- MARE, RUT URS” TA li RM TS [debe
BA, RARHOBERD, MARANA RN. Be. REE ARP. ERR
RATA PB >. HUE PIER ITT “AA” ARERR. TAR, a
MR] Pa “Mad.
RSM AET RP, CAMS T AA OR PT, CRI “UTR” HE
COLE, HERE TY BRD, BBVA FCAT ROMER STINT PY. BRR, CPU aie Beast
esp DHS EEE MURS HEH, 26.
— Ri al
eA
wes >
, =e | meee
aR
FB BL
H26 Resa r ee
AEE AEE RAPS, SG CPU SB Wak HERR, ta Ad BG lg Ate
SIP HE CSbesp—4) AOMKIT. AT AL, AERRATR- MAP IEE (Soesp—4) TRA Sa, BAL
-57-Linux Pa, |
RBA OSE PA a ES [a — CT GPE ULB LM BE LO RB I
arch/i386/mnv/faultc ((3% 151 47.
[do_page_fault()]

151		if (!(vma->vm_flags & VM_GROWSDOWN))
152			goto bad_area;
153		if (error_code & 4) {
154			/*
155			 * accessing the stack below %esp is always a bug.
156			 * The "+ 32" is there due to some instructions (like
157			 * pusha) doing post-decrement on the stack and that
158			 * doesn't show up until later..
159			 */
160			if (address + 32 < regs->esp)
161				goto bad_area;
162		}
163		if (expand_stack(vma, address))
164			goto bad_area;
FA-UK, PHR_E ARYL LAL AEER ET, 3 VM_GROWSDOWN fast 1, BTA CPU MRMRSEAL HT
BUT. SARE AWCRETERU ASI] (bid 1) Wt, PERERA SAAT SEAL fe ROLE AER,
FL FB BR EA A AS HER ET TEAL TT» AERA MA EH, ALA Sbesp—4,
RR RARE. (LEM RL Gesp—40 FE? AAA TE RET S|, TPR
SHER T. ALE, GARAGE “TER” BONER? EH, RAR 4 4, BLL
HLA AL Gesp—4. {HAZ 1386 CPU HA pusha $4, AL — ete 32 PEW (8 32 AB
AS) IRAE. BELL, KYA RTMENUE%esp—32. MIR MIRE AT, TRUER Nt
Rb AE, FeAl bad_area. TAA MA RP. KMRL MALL
REBT EBERT ER, ARIAS BHT LAE OR. HE ZF
AAERURIE, (ESU@DUT RE. FFDLAESEAH] expand_stack( ), GREAT include/tinua/ymmb 112 09
—4S inline AR:
[do_page_fault( ) > expand_stack( )}
487 /* vma is the first one with address < vma->vm_end,
488 * and even address < vma~>vm_start. Have to extend vma, */
489 static inline int expand_stack (struct vmarea struct * vma,
unsigned long address)
490
491 unsigned long grow:
492
493 address &= PAGE_MASK;
494 grow = (va >vm_start ~ address) >> PAGE_SHIFT
495 if (vma->vm_end ~ address > current—>rlim[RLIMIT_STACK]. rlim_cur
496 C(vma->vm_mm->total_va + grow) << PAGE_SHIPT) >
+58.BIS teow
current—>rLim[RLIMIT_AS}. rl im_cur)
497 return ~ENOMEM;
498 vma->vm_start ~ address;
499 vma->vm_pgoff -= grow;
500 vma->vm mn->total_vm += grow;
501 if (vma~>vm flags & VM LOCKED)
502 vyma->vm_mn->Locked vm += grow;
503 return 0;
504}
BH va HEI —F vm_area_struct BUBBA, (RAPER, CE (Ee STO HE
ACEC m). PI. MORAL TCT AER Fr, Thee Re BEAR JL NUTR A Ee EO CE
HED), ABRAM ala, HAT RAE MiB, ABR TAlp At Re Se if oe LL?
FILM). AEARAY task_struct HHP ABA lim SRL, HE TR ENR CO.
RLIMIT_STACK @& 23 41° 2200) BOA DESMA. ATLA, ROME TIA PEON. AOR REC
fant TAT THR ORE, SAAN AT RCIA RUN. RL TTA ea. BRE
AMET» edeiklel +) ABU CY—ENOMEM, AAT EAT ATLA RT : ARR EEL
0. 4 expand_stack( )i&[EI(K{H294E 0, thiI—ENOMEM It, 7 do_page_fault( ) 2414] bad_area,
HSER SH ROPE. Aik ALR FAAS ARR, FLL expand_stack( )—BUAWEIE HE
Fl. (AE, RACH, expand_stack( RAAT HLA vm_area_struct 444. ba AE
Bea AY TAY EEN FERS. IX MES HH PAY good_area TER:
[do_page_fault()]

165	/*
166	 * Ok, we have a good vm_area for this memory access, so
167	 * we can handle it..
168	 */
169	good_area:
170		info.si_code = SEGV_ACCERR;
171		write = 0;
172		switch (error_code & 3) {
173			default:	/* 3: write, present */
174	#ifdef TEST_VERIFY_AREA
175				if (regs->cs == KERNEL_CS)
176					printk("WP fault at %08lx\n", regs->eip);
177	#endif
178				/* fall through */
179			case 2:		/* write, not present */
180				if (!(vma->vm_flags & VM_WRITE))
181					goto bad_area;
182				write++;
183				break;
184			case 1:		/* read, present */
185				goto bad_area;
186			case 0:		/* read, not present */
187				if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
188					goto bad_area;
189		}
190	
191		/*
192		 * If for any reason at all we couldn't handle the fault,
193		 * make sure we exit gracefully rather than endlessly redo
194		 * the fault.
195		 */
196		switch (handle_mm_fault(mm, vma, address, write)) {
197		case 1:
198			tsk->min_flt++;
199			break;
200		case 2:
201			tsk->maj_flt++;
202			break;
203		case 0:
204			goto do_sigbus;
205		default:
206			goto out_of_memory;
207		}
TAAL switch 16 4)71, Ay BL ee TEL ASEAN error_code Rath: ~ 25 AH ERRATA
FADIFE RAAT Cerror_code MERU PA). RRA MAST A, bit
AO, SRAM, Ti bil 1 ARSE. ML, RAMA 2. BEAR ETE, ASE
HANH KARGRSA, TERRE RASAM. Fit, MHMAT 196 tr, WAL eR
handle_mm_fault() 7. 28802 XT mm/memory.c “}:
[do_page_fault() > handle_mm_fault()}
1189 /*
1190 * By the time we get here, we already hold the mm semaphore
1191 */
1192 int handle mm fault (struct mm_struct ‘mm, struct vmarea_struct * vm,
1193 unsigned long address, int write access)
194 (
1195 int ret =
1196 pad_t *pad;
ug7 pmd_t *pmd;
1198
1199 pad = pgd_offset(nm, addres
1200 pnd = pnd_alloe(pgd, address
1201
1202 if (pnd) {
1203 pte_t * pte = pte_alloc(pmd, address) ;
1204 if (pte)
1205 ret = handle pte fault (mn, vma, address, write_a pte):
60.Tiny
ean
28 tener
1206 }
1207 return ret:
1208}
PRA S20 PU LHL AAR ea ETD) mm_struct BBA, ABEL ped_offsen( it MEAT
ILA TL A RORWEET. LCORZE include/asm_i386/pgtableh PIE LAI:
311	/* to find an entry in a page-table-directory. */
312	#define pgd_index(address)	((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))

316	#define pgd_offset(mm, address)	((mm)->pgd+pgd_index(address))
ET FIRM pmd_alloct ). ASEM ME OR AFA) —MP TAT. F386 BPS
WRT, BILE include/asm_i386/pgtable_2level.h P45 ILE Mh “return (pmd_t *)pgd;". WALAEWL, 7F 386
CPU FE, EFA RHP AH CAAT 1D ANEPIBLER. BALL, AF 1386 CPU
WE, pmd_alloc( )EAEAL AWAY, EDLY pmd ATT AEN 0. BER ARH SHE ALL Rowe EY
BAL A PRBEWM A? WMARGREN, AMM ASML ORI — TTR, seta
ARABS SE MOHANL TE AeA BUNA SEL. RAE, ROT TA, APR RE aE
ATR, AAEM ARAL VOR. RAE, PHT Ly Fd sm Ps op SE IP
HES. RALULAL pte_alloct )7EaRIM, SL/QH9ZE include/asm_i386/pgalloc.h *t:
[do_page_fault( ) > handle_mm_fault( ) > pte_alloc( )]
120 extern inline pte_t * pte_alloc(pmd_t * pnd, unsigned long address)
wf
122 address = (address >> PAGE SHIFT) & (PTRS_PER PTR ~ 1);
123,
124 if (pmd_none (4pmd))
125 goto getnew:
126 if (pmd_bad (pd) )
127 goto fix;
128 return (pte t *)pmd_page (pnd) + address;
129 getnew:
1300
131 unsigned long page = (unsigned long) get_pte_fast( ):
132
133 if (page)
134 return get_pte_slow(pmd, address) :
135 set_pmd(pmd, __pmd(_PAGE TABLE + pa(page)));
136 return (pte_t *)page + address:
137}
138 Fix
139 —-handle_bad_pmd (pnd) ;
140 return NULL:
Mt}
“61.Linox PHBE rats
See (Mo MLE Re et CR TU EP aR ak. ARAM Re ABET pd 9S I) Ha
WAS, HMB SS gernew( bare PMR. — TM RA AHEM MOM.
AP RRMA AME TAG. WR MRT, A BOF IRET ML eR TTT
V5 SAE eM JA RA BD AR es
PERL. RPE, GBA AP ULIMTARINE, BERT ULE FREI, BAL get_pee_fast( ). BURADHRLEL
BST, MARAT get_pte_kemel_slow( KET. LABIA E, BALM PEER
TRAM A MA, aA ZIE “slow” ML (oA LAT MTA 2 AAR. FRE ROU TAT
ARSE, MALATE AE A NAM TRIM Le LT LT 5 SME RDU
MAL set_pmat THEIRS UAMMENEIA) LEELA. ARS AHP FLSRCHL pd TENE 1886 RISER
LAST WOME SOR ped 4. AE. WH STAN) “AEE” AB CLEETR AT 5 (A ICT AT pe ALE
fy. aR SPRAIN He AAC P.M HB handle _pte_fault( VEAL. APRBGE LP mm/memory.c
A:
[do_page_fault( ) > handle_mm_fault( ) > handle_pte_fault( )]
1135 /*
1136 * These routines also need to handle stuff like marking pages dirty
1137 * and/or accessed for architectures that don’t do it in hardware (most
1138 —-# RISC architectures). The early dirtying is also good on the 1386.
139
1140 * There is also a hook called “update_nmu cache( )” that architectures
1141 * with external mmu caches can use lo update those (ie the Spare or
1142 * PowerPC hashed page tables that act as extended TLBs)
143
1144 # Note the “page table_lock”, It is to protect against kswapd removing
1145 * pages from under us. Note that kswapd only ever removes pages, never
1146 adds them. As such, once we have noticed that the page is not present,
1147 * we can drop the lock early.
148 ®
1149 * The adding of pages is protected by the MM semaphore (which we hold),
1150 * so we don’t need to worry about a page being suddenly been added into
1151 our VM.
152 #/
1153 static inline int handle_pte_fault (struct mm struct *ms,
1154 struct vm_area_struct * vma, unsigned long address,
1155 int write access, pte_t * ple)
1156
1157 pte_t entry;
1158
1159 i*
1160 * We need the page table lock to synchronize with kswapd
1161 * and the SMP-safe atomic PIE updates.
A162 =/
1163 spin_lock (&mm>page table Lock);
1164 entry = pte;
62+Ble Tee
1165 if (pte_present (entry) {
1166, i*
67 * Tf it truly wasn’t present, we know that kswapd
1168. * and the PTE updates will not touch it later. So
169 + drop the lock.
1170 af
71 spin_unlock (&mm->page_table_lock) ;
nT iff (pte_none (entry))
1173 return do no page(mm, vm, address, write access, pte):
474 return do_swap_page (nm, vma, address, pte,
pte_to_owp_entry(entry), write_nocess);
75 }
L176
ugT if (write_access) {
1178 if (Ipte write(entry))
1179 return do_wp_page(mm, vma, address, pte, entry);
1180
181 entry = pte_mkdirty (entry);
1182 }
1183 entry = pte_mkyoung(entry) ;
1184 establish pte(oma, address, pte, entry);
1185 spin_anlock (&mm>page_table_lock) ;
1186 return 1;
ust}
ARAVA TD, ARTUR AR HRC RAAT ARIGUEE) IG ACE ALA,
He, PLR TTSL A if EAI: REREIHEAL. [1% pre_present( )ik- ATIF WR ALIN LAF CE PS
TPR, CRIN Ae SL LEIEATINNE. DE 2. ple_nonel \PFWIANIA ACHE. EIA
MPRA WR. HALL, BREA do_no_paget ) (2 RUBLE do_swap_page( )). Mitt F.
WAL pte_present( VIMAR LAR TPT A UTR EP eH AZ ARE SEMA EVAR, eh
RARRT AT.
FBR do_no_page( HLAEZ: mm/memory.c 45% UN). OEM EAA, IR RIC RE.
Dh a ARANDA EBA LHR ae AF BLD A AY vm_area._ struct PAF HE vm_ops He) =F
vnoperations. struct BUBSEH). A-MHARGIGSEIR IAL MEBLBESER, ARMIES ASCE
HPAL RAO RARE. SLII-A) (SB MAGE OR EDR A eT RE, YORE A TL GALI a
SCPE TE? BLOF mT RRSP RICA SUN. Se TUES FPS
HRESIEA, TIT REO RRL. AS MRS ART
TLE BINED BAIT ALBIA, AN “copy on write"R COW. XF COW seit] AMEE — AHS
fok BER EBON TEA TH 2B. EAE. SBLt mmap( ys — SRA PRMD — PERTTI CELI BLD
RRA. BRAT ULM Sci TRA EROTICA. een Me
EAVAYSCAPRUUINIBE( LE. 7) Ju thi, 5B 0 SL AA LA A ERAS. DEEL. aR
EE AGS AO COR ER AAAS, TUR, AOL BU Sa wma
SET DL VILIN Te TD BR YF B.A AL vmavm_ops->mopage( > fH, vmasvm_ops Hl
vma->vm_ops->nopage AMA BTARAL, MARA ASAT ZANTE ALM nopaget ERE, SCH RASBLE Ai
63.Limo py Heit (ab
fi &—/> vm_operation struct 4 #4). “HEAT HEE MY nopage( ) MEE IT A Beat A — 4h a Bt
do_anonymous_page( )74) AO YE 4 17 Ki.
FREER H do_no_page MIFAILAT:
[do_page_fault( ) > handle_mm_fault( ) > handle_pte_fault ) > do_no_page( )]
1080	/*
1081	 * do_no_page() tries to create a new page mapping. It aggressively
1082	 * tries to share with existing pages, but makes a separate copy if
1083	 * the "write_access" parameter is true in order to avoid the next
1084	 * page fault.
1085	 *
1086	 * As this is called only for pages that do not currently exist, we
1087	 * do not need to flush old virtual caches or the TLB.
1088	 *
1089	 * This is called with the MM semaphore held.
1090	 */
1091	static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
1092		unsigned long address, int write_access, pte_t *page_table)
1093	{
1094		struct page * new_page;
1095		pte_t entry;
1096	
1097		if (!vma->vm_ops || !vma->vm_ops->nopage)
1098			return do_anonymous_page(mm, vma, page_table, write_access, address);
		......
1133	}
MF RALE MER RE, APRN MTA, IRA MERA TARR,
TR 2275 HEH) nopage( BRHF, fITLLIEA do_anonymous_page( )»
[do_page_fault() > handle_mm_fault() > handle_pte_fault( ) > do_no_page( )
> do_anonymous_page( )]
1058 /*
1059 * This only needs the MM semaphore
1060 #/
1061 static int do_anonymous_page (struct mm_struct * nm,
struct vm area_struct * vma, pte_t *page_tablo,
int write access, unsigned long addr)
1062
1063 struct page *page = NULL;
1064 pte_t entry = pte_wrprotect (mk_pte (ZRRO_PAGE (addr),
vma->vm_page_prot)) ;
1065 if (write access) {
1066 page = alloc_page(GFP_HIGHUSER) ;
1067 if (page)628 tame
1068 return -L:
1069 clear_user_highpage (page, addr) :
1070 entry = pte_mkwrite(pte mkdirty(mk_pte(page, vma->vm page prot)));
1071 mrss};
1072, flush_page_to_ram (page) ;
1073 }
1074 set_pte(page_table, entry);
1075 (No need to invalidate ~ 1 was non-present before #/
1076 update_nmu_cache(vma, addr, entry):
1077 return 1; /* Minor fault */
1078}
FARMER A, WRG RMR SEL, ALA mk_pte( RISTO 2 Bt
ple_wrprotect( )IOLAWSTE; lu RHE BRE (BAL write_access 41: 0). WUiKLLt pte_mkwrite( HULL HELE.
ROAWHAAMWE? I includelasm-i386/pgtable-h:
217	static inline pte_t pte_wrprotect(pte_t pte) \
		{ (pte).pte_low &= ~_PAGE_RW; return pte; }

220	static inline int pte_write(pte_t pte) \
		{ return (pte).pte_low & _PAGE_RW; }
ATHE— F, SATA, ZE pte_wrprotect( jt, 42 PAGE_RW fraviritek 0, aK MEER
AVR: TCE pte_write( MIB PRA MBM 1. AEE eR A. FR AS 2 dt
ZERO_PAGE, i& 47TH EE include/asm_i386/pgtable.h 5 Xi:
91	/*
92	 * ZERO_PAGE is a global shared page that is always zero: used
93	 * for zero-mapped memory areas etc..
94	 */
95	extern unsigned long empty_zero_page[1024];
96	#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
LAER, BEE RE” CHR OR) BATU. FPA eh — ff a Fa — 4 EN CTT
empty_zero_page, MUA PICEA Abu ALT 20 SERRE, TIM AA TEAS 0, APLAR ZF ANI
RY HIRE 0. NAAM RR. 71 alloc_page( HABBO AEA EE. ERATE MER
SA, PAPI NAR THER. IF HALES BEA SRS M4, JITLIBEIBL alloc_page( )#It4}E2—
ADEA FF DUEL, FFAG A MLB] A HE TOE I] PA PARAS Ae AWLP L115 47), sRLIL set_pte( )
RMAH pagetable PHIM. St, Maer TL SIN Ae OI ee Pa Tk
£9 update_mmu_cache( )#f i386 CPU #2 8% CJA. include/asm_i386/pgtable.h), [33 i386 fy MMU (A
FPG IL) AIGRAE CPU ANAK, TIAA MMU.
RURAL. PRRABRBMT. ATR, SEK PM RAMS 1, BE
do_page_fault( ). 7F 3% do_page_fault( )4#, i834 - +45 VM86 BECUAR VGA MARTE RAK
AIR. (EA SRA ROR KAT
+65.Linux Path r
[do_page_fault()]

209		/*
210		 * Did it hit the DOS screen memory VA from vm86 mode?
211		 */
212		if (regs->eflags & VM_MASK) {
213			unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
214			if (bit < 32)
215				tsk->thread.screen_bitmap |= 1 << bit;
216		}
217		up(&mm->mmap_sem);
218		return;
SUES» APM 81 CPU AKU TCT GA A BPE, AG SS AIT IAL
Mer GR AAT ABR, GARE POUT, LR. PRA RR YR
iH, PRU AM Crap HHS) AEN, CPU MH TE AIS. AAR Poe ak dT Ae
Apt RE Ba A SS AS eA HEAT I). SRR AEA, CPU HF TG a
OPRDL O, BEAR, EAE) TAME A NAL CAV FAS RSL) AHR. GES
BEATLES SE Mh EE I SERRATE A NL. JM AREE REE CPU RUA ALE P SLD, EAN
HEP BL. DORE UE, TIN ROT IE” ALATA, BZN TLE ACL. ARM
4, SURBAE WAS PRE, RT CRAMER RCM ESN. MRE
WEATBRT. FPA EH ARR COLOMBIA Goesp JEAIIER RUT). UTE. MPR
EER, HERR UAT, HUT LAMAR Bho) LA ae
RTT. PPR, RAE OAM" I, SRT AR A, TORE
MFR ROS AACE T ALG AST] FES
2.6 物理页面的使用和周转
BRCPU 24h. SFAR Linux LOR ABA RER IE RSC BL, PORTER TCIM AY CGE Rae AR EE
HOT. DRAMA MERAY ER EEO PEA AEE. wt, &
PRR EA Ee eT Bite
HARRIE AB I MERINLA ARIE. “RETEOUN”, AR ALMA EE ELE A, UES
RMA (4KB) MAIDA. RAE TURBOS SAL, A ET SE TE AT
Ly BSL “PHT”. KURA RIAL, PT LEA, te ede b. T
KARMA, ATED MARZ “CED ee i” AU “CRE CRB) SU”. ELSh, AEE
MRL, PERERA bh, AE TREAT TR, RETRIAL. BTL
RATER ER A AE APD SCAT HT AOR, SEPA LARNER SP eae A TT AA LL
URAL, UE, RSE AL PLB, aR ICE
SUATBT. RE UEPERUNE TE EIA CALE I) y 3GBD. Bit, tush LEIA
WENGE, BASALT MB. HAE, F285 Linux COL Unix) BUSA ER API AG LEB
1S, ARLE KB R= KB. nT, SRB AUL TS. LSE REIRIUT Te HRT OR, RE PEATE]
66.RIS Thome
ARERR T . TORRID F, SW RYN ALI RRR. BELL, Ee ALR
RR EIRE RA TARAS —* SRB i ER” AR, NIMH CNBR)
APM LE, UF EIA), STREAM EERE. RR ee
AREER EIA Li, SER AGE THOR SL AT LUE CRA BE)
FORINZ GRICRRE BUD, ACPI “CRB, BABE SUPRA TTT FEAR. GUN, RRS
BERRA, RAE REM RUA HO BEK, FS A Ae RS TM HE Th
RMR” RR.
TERE RY, MACE AL, BREE ATP RA, AMES leat
1, AMORA. TE, WERT, BAH, BARRA ET
Zs PARAS BRR, ARATE, WE RRO ha TR
MOAB HE. BUG, Linux $k 7 ADIT ASA LY RAL, ALBA ER
PEE ETF.
PEST He Se FL 9 RZ SEG I «SA A A Heh PE
MHA MMK SHE, ATE CO) ARTE, RTA ABNF page He
HA. SMA Hi page HAR CURA PE AH task_struct 4H), BATRA
AZ PL BRR RSE” PR, MIE TEENA, GRA, AEE RELA
EM. FR, ELE TPTERU TER, WRT MHEIOEIN page 6, RMA RRA OB
A)". REALM, ARRESTS, yeh CTAB AE 4 page SiH,
Te TP page HARSH BAL, Hk — 4a Fei mem_map RABE. ALY, RE MS HE
REHEAT BR”, PRR AE ER” Cone). ii
ARM LE TARR, MRA GTM PRED. RK, RACE ASLT.
SIAL, BER AC HARE, AT TLC) AA a EN
WEEE RRC, SHRM RAS, SHANE THM, REBAMAS LRA
HE, ARAL AAP ERAT en. Rae A ae ORT. AE
XT — swap_info_struct BAR, A LUAAAUT ALP me OR EE RE
include/tinux/swap.h
49 struct swap info struct |
50 unsigned int flags;
al kdev t swap_device;
2 spinlock_t sdev_lock;
53 struct dentry * swap_files
St struct v'smount *swap_vf'smnt ;
55 unsigned short * swap_nap;
56 unsigned int lowest_bit;
ST unsigned int highest_bit:
58 unsigned int cluster_next:
59 unsigned int cluster_nr:
Cy int prio: /* swap priority #/
él int pages:
62 unsigned long max;
6 int next; /* next entry on swap List #/
67.Linus AIRE, AN
64:
Shinde swap_map fib] MAL, APH TEE SRE Ree EE
"> AMET, TALK Pew a Te A oR SLA. BAHL KAN pages,
CRAAAHRABSM AMID. Be REE. Be Ace, RIAD) mu38 hott
LB swap_maplOVHHRM BT RORY RMR, CLA T ARE Ae
Re AAEM 2 ETL 0) RAED BPD 5 Sek BARA EH HN A CT EH RA
AMMAN ZRENA ARRA), EA MATH HTM SR. ROM RT EE
SMA MHB SN, BEA swap_info_struct 244 5F 9 lowest_bit #1 highest_bit MVS PM TARA
FARR & by eee R.A max RAR RRA,
PELE RL WORK YD.
EDL FF RST EAS UREA GE AS FE aD HS ATT
PRUE SPALL TUS dA Rm REAR AE (cluster) A ALMET. 10 BL cluster_next ll cluster_nr HE
JRE.
Linux ASIEN S + ARS (BOLD, PLES BP REIL T+ swap_info_struct 4549
PESY (AL) swap info, 1X Z4E mnv/swapfile.c Pa LAN:
25	struct swap_info_struct swap_info[MAX_SWAPFILES];
Felt, BEL IZ TANGA BA swap_list, #544 AT UA 4} BE OEE WL A BL ee HE swap_info_siruct
SMR Se LE I TE ik
23	struct swap_list_t swap_list = {-1, -1};
JX 19 swap_list_t Sti 44 #4 2 4 include/linux/swap.h Ps LE:
153 struct swap_list_t {
154 int head; /* head of priority-ordered swapfile list */
155 int next; /* swapfile to be used next */
196};
FHENB AZ, BTA head AM next 4-1. SABI swap_on( RRA TITER
FRI, ESCA AY swap_info_struct 4 MBEABSIE
DUQUE pte_t MRSA MA) HOBAGRA SHERMER A-H, LMA
4—/ swp_entry_t BH SA, CEE include/linux/mmh Pie LEA:
a of
9 * A swap entry has to fit into a “unsigned long”, as
10 * the entry is hidden in the “index” field of the
11 * swapper address space.
12 *
13 We have to move it here, since not every user of fs.h is including
14 nmb, but mh is including fs.h via sched .h :-/#28 Gaoe
15 */
16 typedef struct [
17 unsigned long val:
18) swp_entry_t:
FYI, —AS swp_entry_t POE: LR 4S 32 ATE SME. (IE, RAS 32 ONE EP
STB. IV 270
offset type
2A ti 7h L FRU Ri HE O
B27 Re RE
Cf include/asm-i386/pgtable.h 11299 type A offset MMIBA VIALS pte St Z MW KA,
ERT LAER Es
336 /* Encode and de-code a sap entry #/
337 #define SWP_TYPE (x) (CO). val >> 1) & 0x31)
338 define SHP_OPFSET(x) (G). val > 8)
339, Hdefine SWP_ENTRY(type, offset) \
(Gswo_entry_t) { ((tyne) << 1) | (offset) <« 8) 1)
340 #define pte_to_swp_entry (pte) ((swp_entry_t) { (pte). pte_low })
B41 dofine swp entry topte(x) — ((pte_t) { (x). val })
BEE offset SORTER BEER SCP OE, BALSAM ST type a
HRSA ASP, AES). AMR Me HRY SARTRE, IOAN HE
PRCA S CSET 127 MPC, (SEB ARETE, WEAF 127), ©
FAIAKZM type WE? GNI pret HAHA. PATRIA, pte tin WAL 32
FES SM. HARM) 20 RE AEE A 20 fe CEE TTR AR HLA IE 12 fire O.
SATA 4K WEALTHY), SEK 7 NADA LAOS TCE AHL bea. hn ROW.
US, 39%, HOLA Wy type AUR. Til swp_entry U5 ple MERA RACING, KABA.
NEAR, TUR REP AACA pret BUM PARA) 1, RERTURE TER, TRA th
ERS Ae TL UAT SAM Re AT IE AS A, RUAN RT eA)
SEF OER, TEER TA swp_entry_t “428”, AAAI HUET. PLS 0,
ARH AAR TEPS He. TEL CPU AFI MMU SL ERIEAC A ACAI AL, TITRE TRASH BS RCINCLA
FEF Linx AKCE, BRA ESRHE STE SRR LIOR, SHEE PCPA, Le
Ra Auk eae
FRU, SRE ANTE, FORE AN eG TARR: ANE TPT. MY
HOLT WM HAE TT. RA RRR AL, RUPE SWP_TYPECeniry) AH
SWP_FILE(entry).
RRS A AS 5 BR TT ESL SP BALE a it — 2 HR
69.Linux Wine
FAN — FFE — “MH TT 9 08 B_swap_tree( ). Lik TA BUR. Bear eT LRT
EGR RRM. Hee MORE PE movswapfilec TY. ii A Ad ea AY
—~get_swap_page( AER Cth, BEAM BTR.
HHAL__swap_free( OFFI IUT:
141	/*
142	 * Caller has made sure that the swapdevice corresponding to entry
143	 * is still around or has not been recycled.
144	 */
145	void __swap_free(swp_entry_t entry, unsigned short count)
146	{
147		struct swap_info_struct * p;
148		unsigned long offset, type;
149	
150		if (!entry.val)
151			goto out;
152	
153		type = SWP_TYPE(entry);
154		if (type >= nr_swapfiles)
155			goto bad_nofile;
156		p = & swap_info[type];
157		if (!(p->flags & SWP_USED))
158			goto bad_device;
INR entry.val yO, REGIS ANE BHAT. By Ceti TT ea a ae ICE EM O RAE
SCHR. AT, GUT ATI, SWAP_TYPE ABI] LAE TD SCHR MH ALES , BDSE swap_info_struct
SFYC swap_infol JAALHI F AR. FRA 156 47 LLG FARIA swap_infol | FIR A ABC HEH
swap_info_struct “#4. SCHR BILLS, “Fm HROY OCT T
159		offset = SWP_OFFSET(entry);
160		if (offset >= p->max)
161			goto bad_offset;
162		if (!p->swap_map[offset])
163			goto bad_free;
164		swap_list_lock();
165		if (p->prio > swap_info[swap_list.next].prio)
166			swap_list.next = type;
167		swap_device_lock(p);
168		if (p->swap_map[offset] < SWAP_MAP_MAX) {
169			if (p->swap_map[offset] < count)
170				goto bad_count;
171			if (!(p->swap_map[offset] -= count)) {
172				if (offset < p->lowest_bit)
173					p->lowest_bit = offset;
174				if (offset > p->highest_bit)
175					p->highest_bit = offset;
176				nr_swap_pages++;
177			}
178		}
179		swap_device_unlock(p);
180		swap_list_unlock();
181	out:
182		return;
WADA. oftsen fo ESC PH CES AR AS A PC BE OT
poswep_maploiiser EVA Hi (SMEAR EIT TRC, 40% OBER MLACAMER. FIRL, SFCET ML ASM A
‘FT SWAP_MAP_MAX, e382 3X count 2707 LMS EA, BELA ATE SCP 2: count.
SUMGAS) OW, RATTGALSIESERC Ta, I FOREST NRA EZ oh, BRE
‘ARNG MO 4078 FEY FF lowest_bit 2% highest_bit, [FIN 6/ PeA-ALATAR EM sil Bem nr_swap_pages
HUSH. (ERA, AERC TURD ESI LIP RE, EN fee “TS”
ERE, eRe LAURA LASHER. BTL, TES REY.
UT AN Ae ATT a A a aN BT
HEI, BARAAER— MAT EIB, IT page SENT count 4y 0+ Mitt
Spt i SE 1, GRCAL CBRE emqueue( )*P3HLi set_page count( BAIN, RUE WME A
sh.
PAE RM APS mR, TU ENA, IA RT
EHR. SL EAR SEH TE A A eA TA. SEAL AT We TD BT DL
HEH, SEE, FCM IE ih A eee, NR ERT TTT Ne
RB Th, Ce PN ety CLG TAT OYE LNT, HT ZT AY TR IRE OP ART
AO. DR OP SIRT”, RETR > URINAL SEIal BY A, ZEW)
V9 Be SEP A HC
STURT PAE AUER, FAL ea ATRL
© ARCH ORT, MEAL CAEL, BOREL JEHLEL, Diana rAen “éefeHE”. St
TATE IAA I OU ERA Ah EB CH BR, ALAA RSC PR BBR a a
mary.
© TRAD mp NSA ATER.
© HAM MIR AX.
SAL A EDD RP, AEF RT, UB Ae A RS / BLA
FJD BL RR IN AB AS eA, EERE eA AST BE SIL
SE PARES ed ah oO PN AE DU TM AN ER SPAS SP, SSS
f. CHG F, BERARDI: RCMB CE PETAL, BRAUN CHT AR a, RADAR
Ws RGB MRE TA, JEL eT AAR LT es 9 AT PD
Fak Z 5h, PB EHNA EOL LAS, AAR EE, ARSE Boe.
HAASE PY TE ARH 7 PH TET LL)
RR BAH TE REA OPEL, PRLLSC (RATE Lule, TUM A DSS ATA, BRAE
SIR GSR) ARF CFP + EPR. ROLAND
© Perfil aL kmalloct )38% vmalloo( ) PE. ESAS 1 EA Bly FEE TT BLUES A,
71.Linux AH (a 4 et
0 vma_area_struct BURA. LOUIS — HARSH RTE OME, TCA AT
FL. BALA T TREE S TRAD, PURE) AA ET
WP.
© A BeP BLL alloc_pages( )P 02, FRE SACI at fF Jy Fl ASIN fe Ae
FARAH RT Me ETO» DL AL DAR El Ba ad 88 BC HH dP LE
HARASS EAR EIU, DTLA SERINE OP FEI.
ARR BIMEMSEE T . HILSON. SUR AE Si, facile ahd FEA”
{WEBI LURE DUS HUE ROK. QR ITT ORE MCE RT” QUEM LRU BAR, Bit
ESE Sle"; WRAL ARLE SAAT, EARS RMEN A AP
SUCRE, ASME A mb. BOAR ABLES Fee:
© EXPE RARE DR PAE ik 28 SF HY dnetry 14274),
© ICR SR EP RENT fl inode HHH SETa].
© API ARE / SPE.
SEAT AAR OER TR A Ree hk A, RE WRI PAB th EAS
BY, PAD SRAM RRMA T
ANZ Fy RMS I, PERSO HY RM KIT.
SEAR, ROT MAGTUMA RRR, AAT IBSEN TP, JA me FO
WA BIA BIATE DUETS. WRAL ETA ICA ROT AL, LRA PIL fe
LE. AUTH EAB RID. (AA, PEA EE A RB YR
FE GREE”, RAR RCE TTT RATE CRRA IN A, ER
PARTE DAT, PUR AE mT LAI LH TU A PTET, MATT CE RA EU LAR,
RPGR TL POR Be 3S LATED TPT TRS, RT REM, RATE LRU, BIS “a
SRD ABI” Qh. AL, Lea A OU es TR A SRL ARE
FRAT EH MUS RD. A, TATA R AMR, RUE ART
RAGA. ARMOR LAL, MLW T, FR MEME CRE. ARERR
OR, ABP RESE TD RSA AA AE 77 ARR BRA. (RIAL, TSE IK EAA BERET A SE
BE. BARN COUR) “Bray”.
WaT Bi ARAL ALA mY ST A A CR
PPS Ae LUAU REE A ALAA, ARCO OU TM TAS AAA EEA TT es AA
AHR EDU (CP PRAY 0, FEAT ANTE DA), AEA IRAN FE TTIAISEAN TOE TA
AE page HHH TE “RITE “Coache) BAS) CORERAEPTBARI) Py FURLRILM “HRBRIRAR HEAT
TEBORAS”, BURA “BUR” BAL RR". EPA OBR”, WER, MEE
UT AL RAP SEMAT = SCRE MO 7S A Be a SA PTT RR TRE, LN
SL TUT AUT EBA PPAR AL AEN ETT. RK ZR TU SU AE, OR UE
AS, RRAPEN LRAT. RZ, MUR ROLL, PATRIA TEL, OA
POND EDEA AP HAMM RE, BLAZE A, MAA TR TRB T . RATE
PEED FU NY UT SL BUH), OATS A ET A, A BR
ING SSE A HR a POA a TT OST RAS
ROSEN NLA TT DUR Sab Mn RE, JF EL Rate MT AER. ne, MUA
ROAM AML, BEAT DUA IESE nT DL. Ae. EMER PA IE ASA
+72.B28 eine
WE. MRAM RAM TNICUEMA SS GM, BAS OLIN “Fee” BN, hates
RLM AH BLAH. SR, ae A Oat, ta AS ha al
HE, PTCA A TR RIT. Ht Bea) “70” RL” BAS. Mie
BG. FP SH” TE, Wik RAR ARS AME, BSE “TI
WG MER RRA.
SRM, ERATIONS, / Sab a AGRE AN
(1) BAe TUNAY page Sami AIH LIL $y Ai list EASE STL FEE cone SAE WA
free_area. WIRE (E)H Ht 8 count 4 0.
(2) 48d. iE PAM__alloc_pages( )H%__get_free_page( )M Mt 422A B\Fisp SPAN fe HH. FEE
ARP ACI AAD Re count Bn 1, H¢ page Mets Ai Hy BA Fi] Lise £5 #4 Ml) de RAPA
@) RBA. HTM page BARA VUELTA rw BEA TRE ILIA active list, 3288
DAMA in) TACT AT, AY A TE CE, AE A
AYR SK count Hi 1.
FGGERAR ASCE). GLIA page SURAT OWA ira HAAR “UE”
inactive dirty list, (4 LW E7S 4 (CARLES TE RON AT. AE I TT AOE
Wf AE HC EAT A T+ 8K count I 1.
(8) HORTSBR “AR” SIG AP HAZE ER A, SPAFDCHTUY page BREE MARIERE “IE” TUK
Bl inactive_dirty_list BMAP AR “TIP” MARE.
TADURACHID). TUT page MARSA MILL TISAI Irv BEARS ER TH” $8
TRBASY, SS HRT EIR A A AARR “FTP” GUOBA FI inactive_clean_list.
(7) BAR AERA ASHSWEARASE AAS — BR AICTE, USCS AP REO
(8) SARE, BM TUR” TPAC HT, BRISA, PAR TSH
SR, LMR RE DI oe,
HS SIRS, TE page BUREN BERT AFAR ARS, JRE ERT Ota
active_list #l inactive_dirty_list 4S LRU RAFI), ECE E+ 1h DLR ch ek HT —4 inactive_clean_list.
FEET AY page 7MI7EIRAE LRU PAY UAE, MRT DA ROAR A PCT A Ai DAR AT PES
PUTAS. FIN, LLM FRMY address. space MEdH EH swapper_space, HIATT
WATE IES Ae, FRAN Re AAI page MIRA AIR IDA SA ise BEA JP ASS BA
Bl, Hh, ADR ERM MRR, MILT —T ARR page_hash table.
UPAR GE ARS — 4 AE ASU. WER RA PORE
AERA fe HIRT LE » MLAB et add_to_swap_cache( Y#FSE page AMIHEA AA AIA, J&P BEATE
* mmiswap_statec Ps
a
©
 54 void add_to_swap_cache(struct page *page, swp_entry_t entry)
 55 {
 56 	unsigned long flags;
 57
 58 #ifdef SWAP_CACHE_INFO
 59 	swap_cache_add_total++;
 60 #endif
 61 	if (!PageLocked(page))
 62 		BUG();
 63 	if (PageTestandSetSwapCache(page))
 64 		BUG();
 65 	if (page->mapping)
 66 		BUG();
 67 	flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));
 68 	page->flags = flags | (1 << PG_uptodate);
 69 	add_to_page_cache_locked(page, &swapper_space, entry.val);
 70 }
CARRIE AMM BL, UR. Bly RIS MEHTA, $C PG_swap_cache
PREM UAA 0, HET mapping OG 0. MIA, RMA ARLMI MRR REAM, BAR SE Lot
H—S, AFUAIE PG_uptodate tra 12 SK 1. HA__add_to_page_cache( )((V% XK, mmvfilemap.c:
416 /*
ATT * Add a page to the inode page cache.
478
419 * The caller must have locked the page and
480 set all the page flags correctly.
4810 #/
482 void add_to_page_cache_locked(struct page * page,
	struct address_space *mapping, unsigned long index)
483 {
484 	if (!PageLocked(page))
485 		BUG();
486
487 	page_cache_get(page);
488 	spin_lock(&pagecache_lock);
489 	page->index = index;
490 	add_page_to_inode_queue(mapping, page);
491 	add_page_to_hash_queue(page, page_hash(mapping, index));
492 	lru_cache_add(page);
493 	spin_unlock(&pagecache_lock);
494 }
iW BEA mapping BA address_space MHA, LLswapper_spaces RAPMAKA HIE XI
include/linux/fs.h:
365 struct address_space {
366 	struct list_head	clean_pages;	/* list of clean pages */
367 	struct list_head	dirty_pages;	/* list of dirty pages */
368 	struct list_head	locked_pages;	/* list of locked pages */
369 	unsigned long		nrpages;	/* number of total pages */
370 	struct address_space_operations *a_ops;	/* methods */
371 	struct inode		*host;		/* owner: inode, block_device */
372 	struct vm_area_struct	*i_mmap;	/* list of private mappings */
373 	struct vm_area_struct	*i_mmap_shared; /* list of shared mappings */
374 	spinlock_t		i_shared_lock;	/* and spinlock protecting it */
375 };
SPALL, WAALS TP" A AE A CEG), Fh BAT
locked_pages iF ie BE Bi ex 40k saz ZE Wy 77 AS iL de lh OA We dtl BCH AD swapper_space (Ky 32 XRT
‘min/swap_state.c:
31 struct address_space swapper_space = {
32 	LIST_HEAD_INIT(swapper_space.clean_pages),
33 	LIST_HEAD_INIT(swapper_space.dirty_pages),
34 	LIST_HEAD_INIT(swapper_space.locked_pages),
35 	0,				/* nrpages */
36 	&swap_aops,
37 };
BMPR — TRE REE swap_aops, HILT AA swap Hep ATLA MEET.
JAs8 8 add_to_page_cache_locked( PFT AFH], TH page MAH = TOSI. PHRASEA,
page SHIH MICELI: list REAPER swapper_space, iMH##t next_hash AXLE pprev_hash
BARAORDT, JEL W IS Ira A LRU BAI active_lists
{X34 page_cache_get( )7E pagemap.h 472 XH get_page(page), Ilr LA AH RM MARAT a
page->count Hi 1. JX 424 include/linux/mm.h #15 LEA:
150 #define get_page(p)		atomic_inc(&(p)->count)
151 #define page_cache_get(x)	get_page(x)
‘e549 (09 page SAMITLML add_page_to_inode_quene( jin AS] swapper_space :|'tY clean_pages BL,
SEAREIE include/linux/pagemap.h *P:
 72 static inline void add_page_to_inode_queue(struct address_space *mapping,
	struct page * page)
 73 {
 74 	struct list_head *head = &mapping->clean_pages;
 75
 76 	mapping->nrpages++;
 77 	list_add(&page->list, head);
 78 	page->mapping = mapping;
 79 }
TTL, BEAME swapper_space '['(H clean_pages PAF], RUA AC LG Uk A YTB MAE “Te”
RB. {| 4X4 HH add_page_to_inode_queue WL? IAEA WMATA A Wy HATTA,
SCAPEVE / SAL SEAL BURL. ADR AR] 4c Pen wR ++ address_space BEM
ORES, TAR BES ANIC HEHY inode HGRA HIH AI ulS> i_data, ABMAE -7S address_space SUBMIHY.
NORRIE ABUT 2 He IUIATEA] address_space SUES swapper_space 5 kt MEL.
fait __add_page_to_hash_quene( #5FLHEA FIRE AEB, HORA IEZE mm/filemap.c A:
+75.Tes.
psa
Linux A BSACPH ot
 58 static void add_page_to_hash_queue(struct page * page, struct page **p)
 59 {
 60 	struct page *next = *p;
 61
 62 	*p = page;
 63 	page->next_hash = next;
 64 	page->pprev_hash = p;
 65 	if (next)
 66 		next->pprev_hash = &page->next_hash;
 67 	if (page->buffers)
 68 		PAGE_BUG(page);
 69 	atomic_inc(&page_cache_size);
 70 }
BEA RU ASD SUOR TARR AL
#define page_hash(mapping,index) \
		(page_hash_table + _page_hashfn(mapping,index))
SVAN TAY page HUE Mill Iru_cache_add( )HA BI ALE P EN LRU 6) 3] active_list PR, 24S
# mmAwape #:
226 /**
227  * lru_cache_add: add a page to the page lists
228  * @page: the page to add
229  */
230 void lru_cache_add(struct page * page)
231 {
232 	spin_lock(&pagemap_lru_lock);
233 	if (!PageLocked(page))
234 		BUG();
235 	DEBUG_ADD_PAGE
236 	add_page_to_active_list(page);
237 	/* This should be relatively rare */
238 	if (!page->age)
239 		deactivate_page_nolock(page);
240 	spin_unlock(&pagemap_lru_lock);
241 }
J FAMY add_page_to_active_list( )A& S226. 5X T° include/linux/swap.h (Al:
209 #define add_page_to_active_list(page) { \
210 	DEBUG_ADD_PAGE \
211 	ZERO_PAGE_BUG \
212 	SetPageActive(page); \
213 	list_add(&(page)->lru, &active_list); \
214 	nr_active_pages++; \
215 }
INT page Bczh24 #9 AT LOB SESE M4 SEAS ru AAI LRU BS. LGR EA
PG_active, PG_inactive_dimty UA PG_inactive_clean JHp.G(RAHW AML SOP. Wsie
BAER BY ME MG UE
TRAE CE ROOR, JEP RULRRT DLE LS Sah TeROETRE, RTLLL NEE
PURE FCA AA 77 By eR PE OL AR ET FAD map yb CR BE AL
fal, SRN RAL SEAL, ESTA / PALIT TERR, SKB ARIAT swapont )Al
swapofi)= S/R:
swapon(const char *path, int swapflags)
swapoff(const char *path)
JWI RSE tH AE ALP REP EE, A RRA RS a BA SE TY
BARRE. STAT AT ARIK USC AAS FP SRR ie A 8 A RS
SR. ESR, AAR. ite “AGL” AS, AANA Flash Memory (INTE) KAR
BRET, Xf Flash Memory WERE ARAMA, AER PMARLAL, Raa SA,
HAUTE RR (SMR S AEH). SESR, Flash Memory LAGE AM fE DLAI BEEN, TLL HE
PRE AE RAT. SE, 4 Linux ABUT i), APTA TUTE A AR
WER CM ED Tetche dies tS Le, MAT At STZ BLL RAEI swapoa( )
AUN SCALREFE swapon. 2 SAB AX dr OAT MAE HE SBE OER FS
Jot, BALSA ASA RAIA, ASTRA ARN. AFR LARA THA
SERDAR), MRL MASE AEE — SP ATT
2.7 WER aya
ESBS), SRR TA Ret, AS DMA MAT RMR. HSE,
ARLE, HP ETT SETA) “OK” BERANE, RGAE DMA. AYA FSC tt
Bese.
4-SEBEA RAPES HWM, FPL alloc_pages( KIM. Linux PIE 2.4.0 Hie
RESP FE alloc_pages( ), --7S4E mm/numa.c *B, 9} -4S4E mnv/page_alloc.c P, HREM ARIA TE
SL ae PE EEF CONFIG_DISCONTIGMEM #E I. ATA? LORRAL Hay TR
ABARAT] “HUG” BEN AE.
FERRET NUMA 289/15 alloc_pages(), 340047F mm/numac :
 43 #ifdef CONFIG_DISCONTIGMEM
	……
 91 /*
 92  * This can be refined. Currently, tries to do round robin, instead
 93  * should do concentratic circle search, starting from current node.
 94  */
 95 struct page * alloc_pages(int gfp_mask, unsigned long order)
 96 {
 97 	struct page *ret = 0;
 98 	pg_data_t *start, *temp;
 99 #ifndef CONFIG_NUMA
100 	unsigned long flags;
101 	static pg_data_t *next = 0;
102 #endif
103
104 	if (order >= MAX_ORDER)
105 		return NULL;
106 #ifdef CONFIG_NUMA
107 	temp = NODE_DATA(numa_node_id());
108 #else
109 	spin_lock_irqsave(&node_lock, flags);
110 	if (!next) next = pgdat_list;
111 	temp = next;
112 	next = next->node_next;
113 	spin_unlock_irqrestore(&node_lock, flags);
114 #endif
115 	start = temp;
116 	while (temp) {
117 		if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
118 			return(ret);
119 		temp = temp->node_next;
120 	}
121 	temp = pgdat_list;
122 	while (temp != start) {
123 		if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
124 			return(ret);
125 		temp = temp->node_next;
126 	}
127 	return(0);
128 }
FRA» AE NUMA. iS SRF 2 Le ane EE OR RA. TL BAR A HE AT
CONFIG_DISCONTIGMEM 43 MA #9 BIE. iL. GALA IE RAHI AL “ANE Gee Al”,
TIAE CONFIG_NUMA, 3E9:, EASE EF HIE EAP SO NUMA, 16 BS Ae
‘ak ALR Ok LAR AE BS SEA, TTA SE PT) A A © PA ASE i
AD RE BAS) 2 HAP EE CALA TY TT. TDL, AAR RETA
ELMAR, BMRB ATAT ME, AMAT pe_datat SHRM TA
WAN AB TSH, BEM ofp_mask EMER, RR RAB MS ACMS, A — 4A onder
ROTTER ID, TIDGE 1 2s 4s ey ER) NOOR ATT
7E NUMA 4641 RR, ATLL LPF NUMA_DATA il numa_node_id( )#22 CPU PRET A)
pg_datat MARSH. MAARES UMA BP, MUA pg_data_t BE i HIMBA SI pedat_tist,
78.RIF Hon
SESE AT RTT, CAH Ta
PMP EAB EEF PST while TR, CAIN GERM temp FEEDS AI, ASTI
DB ASB AID) ROT ATTA, BLEEP ASTRO, AEC RIK
TEGO, 3 F455 A, FH alloc_pages_pgdat( SARA ALA TATUM, 4 RCS TE mm/numa.c
He
 85 static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
 86 	unsigned long order)
 87 {
 88 	return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
 89 }
TYR, Be gfp_mask 261K SF PERS ET PA node_zonelistsf JAI Fn, Bese FLAY STAC R
ESRB S PA THETA UMA S590 alloc_pages( PAR —F, AEFTDIB MEDC, ZEXEMEAE TE]
UMA S40 RA —75 A contig_page_data,jij7E NUMA 28 Hye AES Ia) UMA SiH HUA Bh
eMt[a] UMA 48#509 alloc_pages( E7EX{F mm/page_alloc.c PH NH:
343 #ifndef CONFIG_DISCONTIGMEM
344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
345 {
346 	/*
347 	 * Gets optimized away by the compiler.
348 	 */
349 	if (order >= MAX_ORDER)
350 		return NULL;
351 	return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask),
		order);
352 }
15 NUMA Sti f4ift! alloc_pages( AHIR, 3&4SH68AX1E CONFIG_DISCONTIGMEM Zi X81 4 #421
BE. LN ABR AP EE
SUBMIT TE APAL cb es 8__alloc_pages( )5GAk, HACHSTE mm/page_alloc.c "F, HLTA EB:
[alloc_pages( ) > __alloc_pages( )]
270 /*
271  * This is the 'heart' of the zoned buddy allocator:
272  */
273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
274 {
275 	zone_t **zone;
276 	int direct_reclaim = 0;
277 	unsigned int gfp_mask = zonelist->gfp_mask;
278 	struct page * page;
279
280 	/*
281 	 * Allocations put pressure on the VM subsystem.
282 	 */
283 	memory_pressure++;
284
285 	/*
286 	 * (If anyone calls gfp from interrupts nonatomically then it
287 	 * will sooner or later tripped up by a schedule().)
288 	 *
289 	 * We are falling back to lower-level zones if allocation
290 	 * in a higher zone fails.
291 	 */
292
293 	/*
294 	 * Can we take pages directly from the inactive_clean
295 	 * list?
296 	 */
297 	if (order == 0 && (gfp_mask & __GFP_WAIT) &&
298 			!(current->flags & PF_MEMALLOC))
299 		direct_reclaim = 1;
300
301 	/*
302 	 * If we are about to get low on free pages and we also have
303 	 * an inactive page shortage, wake up kswapd.
304 	 */
305 	if (inactive_shortage() > inactive_target / 2 && free_shortage())
306 		wakeup_kswapd(0);
307 	/*
308 	 * If we are about to get low on free pages and cleaning
309 	 * the inactive dirty pages would fix the situation,
310 	 * wake up bdflush.
311 	 */
312 	else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
313 			&& nr_inactive_dirty_pages >= freepages.high)
314 		wakeup_bdflush(0);
315
BHNAATSE. B-PSR zonelist HCY -~PHRSP ERRAND zonelist_ BURA.
SH order Ml ‘iii thi alloc_pages( )PMVAUIF]. 4/4 memory_pressure RRA # KSLA
Ay, GSP FE PUI, JPLARIN MUR. 208 (FPL gfp_mask 3 E118 ZC FLAS
KH. 2-KATEMH ONS, MRERGMH ABET, ABSA, LAA
RAHM, RHE aBR direct_reclaim Hk 1, FavAT MAA OD PE HY “ATP”
SOU AP EM. RANMA AROS WE MRE SSO H, RAR TMA, He
RERBEITMHASHARAMNREREHEA, CRASARDRRH, MARKERS T.
ey ASR HAE AN — se BE RE TT PE, ER FS BEA,
SET Pehe, Heoh, 4A PC TAGE. ZB EEWRAM kewapd Ail bdflush PA SARA. ike
+80.B28 fwow
ee SEA TUR COFIL “TUTTE TAR”). ERE PA:
{alloc_pages( )> __alloc_pages( )]
316 try_again:
317 	/*
318 	 * First, see if we have any zones with lots of free memory.
319 	 *
320 	 * We allocate free memory first because it doesn't contain
321 	 * any data ... DUH!
322 	 */
323 	zone = zonelist->zones;
324 	for (;;) {
325 		zone_t *z = *(zone++);
326 		if (!z)
327 			break;
328 		if (!z->size)
329 			BUG();
330
331 		if (z->free_pages >= z->pages_low) {
332 			page = rmqueue(z, order);
333 			if (page)
334 				return page;
335 		} else if (z->free_pages < z->pages_min &&
336 					waitqueue_active(&kreclaimd_wait)) {
337 				wake_up_interruptible(&kreclaimd_wait);
338 		}
339 	}
340
GE RERE SPILT FAI AL TAT HR PEP DE DH AS RR TR PA
HSS, WRB Ze “RA” OL, BEAL rmqueue( AVIRA RR. BEL
EATS REE TR, AAT MERE CSch LE REA RAY kreclaimd) 7£—MES BAT
kreclaimd_wait FRE, SHE CME, ULE BDI TC aR. BAK rmqueue( RFA — PTE
ERR PEROAG HIM, FLA mm/page_alloc.c P:
[alloc_pages( ) > __alloc_pages( ) > rmqueuet )]
172 static struct page * rmqueue(zone_t *zone, unsigned long order)
173 {
174 	free_area_t * area = zone->free_area + order;
175 	unsigned long curr_order = order;
176 	struct list_head *head, *curr;
177 	unsigned long flags;
178 	struct page *page;
179
180 	spin_lock_irqsave(&zone->lock, flags);
“81.re
Seas
Linu Aa kD
181 	do {
182 		head = &area->free_list;
183 		curr = memlist_next(head);
184
185 		if (curr != head) {
186 			unsigned int index;
187
188 			page = memlist_entry(curr, struct page, list);
189 			if (BAD_RANGE(zone,page))
190 				BUG();
191 			memlist_del(curr);
192 			index = (page - mem_map) - zone->offset;
193 			MARK_USED(index, curr_order, area);
194 			zone->free_pages -= 1 << order;
195
196 			page = expand(zone, page, index, order, curr_order, area);
197 			spin_unlock_irqrestore(&zone->lock, flags);
198
199 			set_page_count(page, 1);
200 			if (BAD_RANGE(zone,page))
201 				BUG();
202 			DEBUG_ADD_PAGE
203 			return page;
204 		}
205 		curr_order++;
206 		area++;
207 	} while (curr_order < MAX_ORDER);
208 	spin_unlock_irqrestore(&zone->lock, flags);
209
210 	return NULL;
211 }
CLAD, ACROPELTE ATHY page RAREAH, LLL ALBERT CER AR EIA. 3}
ROTM SASH ASU, MARRERO HMM RE SOR EC ATA
ARATHRAY. BFLLEAT spin_lock_irqsuve( HAIMA MEO AAUHTE. WER MHA ER
zone->free_area JEMVRPEUR, BAU one->free_area +order WHET] SERIA A DEEN TFIIB
Ske SEE BEERR TEAS do_while MRTMET. ENA AISMER A BRM TLE SAL, UAE
COBRA ANY RMSE ALES) BUSI SPA, RLODAOLE, RAEN AIT ACHP AR
TPE VERT CBRL 196 ATH expand()).
38 188 17 40H memlist_entry( )A—7P3RE AFIS -/MEMY page 70K, AML tt memlist_del()
IUMBUTIPINER. Auk, SUES LRU RL AOR.
KL expand( EA FI—I fF (mm/page_alloc.c) PE LY:
[alloc_pages( ) > __alloc_pages( ) > rmqueue( )> expand )]
150 static inline struct page * expand (zone_t *zone, struct page *page,
151 	 unsigned long index, int low, int high, free_area_t * area)
152 {
153 	unsigned long size = 1 << high;
154
155 	while (high > low) {
156 		if (BAD_RANGE(zone,page))
157 			BUG();
158 		area--;
159 		high--;
160 		size >>= 1;
161 		memlist_add_head(&(page)->list, &(area)->free_list);
162 		MARK_USED(index, high, area);
163 		index += size;
164 		page += size;
165 	}
166 	if (BAD_RANGE(zone,page))
167 		BUG();
168 	return page;
169 }
SHRP BY low REL FAA AT EEA st order, if high JUL THe He EPA WA SIC
SEH E AG LER ARAYA SI) EN curr_order. SPRAAAEM, MA 15S 47 FF RGHY while HF
BLT. PRA RBIAWVERK TATA AD CARAT BEN TRENT A), ARRAS UR EE
ARRON EMO TRO SUP 2, FAIRS ABA DIME, BOLAEM 158 1725
162AT PSEA. PGR Wid SE, THUS ERE AD RAISER CB 163 FH 164 47),
FBT AMAR PSNR TI. TE, BUT high 5 low WAI, th
BURST RG BER PAL REIN Mee, ERR A
BUX, rmqueuet )— EU LAS. HST A BEAM. HR rmqueuet ) KW, h|__ulloc_pages( )
HELLK for TREMOR, BRR TOLER F—“METE, ABI RSD, SAME T SHH NE
SEKI CML 327 17). MRS ACRLD TT, I)__alloc_pages( )EFI—+ page AIH, HMM
PITA page S444), Jt Li& page MAE AHH count 4 1. WARK IATA AE PAY TUM order
WO), NUE SED TY EARS 1
BLES A OE PIT ALY TT IT, ORR Ae” ARR, A RACER 9
BER POE CORLL” AUSER, LARP AE TT ALC IN “ANGER TS Mh” APE. FUE
FH __alloc_pages( )f3{RE3 (mm/page_alloc.c).
[alloc_pages( ) > __alloc_pages( )]
341 	/*
342 	 * Try to allocate a page from a zone with a HIGH
343 	 * amount of free + inactive_clean pages.
344 	 *
345 	 * If there is a lot of activity, inactive_target
346 	 * will be high and we'll have a good chance of
347 	 * finding a page using the HIGH limit.
348 	 */
349 	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
350 	if (page)
351 		return page;
352
353 	/*
354 	 * Then try to allocate a page from a zone with more
355 	 * than zone->pages_low free + inactive_clean pages.
356 	 *
357 	 * When the working set is very large and VM activity
358 	 * is low, we're most likely to have our allocation
359 	 * succeed here.
360 	 */
361 	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
362 	if (page)
363 		return page;
364
3& HELL BBW PAGES_HIGH iff Hi__alloc_pages_limit( ); MRIDAAT RB MA DME. BU
PAGES_LOW FHial]—Yk. HML__ alloc_pages_limit( )(#/{t051HZE mm/page_alloc.c "P+
{alloc_pages( ) > __alloc_pages( ) > __alloc_pages_limit()]
213 #define PAGES_MIN	0
214 #define PAGES_LOW	1
215 #define PAGES_HIGH	2
216
217 /*
218  * This function does the dirty work for __alloc_pages
219  * and is separated out to keep the code size smaller.
220  * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
221  */
222 static struct page * __alloc_pages_limit(zonelist_t *zonelist,
223 			unsigned long order, int limit, int direct_reclaim)
224 {
225 	zone_t **zone = zonelist->zones;
226
227 	for (;;) {
228 		zone_t *z = *(zone++);
229 		unsigned long water_mark;
230
231 		if (!z)
232 			break;
233 		if (!z->size)
234 			BUG();
235
236 		/*
237 		 * We allocate if the number of free + inactive_clean
238 		 * pages is above the watermark.
239 		 */
240 		switch (limit) {
241 			default:
242 			case PAGES_MIN:
243 				water_mark = z->pages_min;
244 				break;
245 			case PAGES_LOW:
246 				water_mark = z->pages_low;
247 				break;
248 			case PAGES_HIGH:
249 				water_mark = z->pages_high;
250 		}
251
252 		if (z->free_pages + z->inactive_clean_pages > water_mark) {
253 			struct page *page = NULL;
254 			/* If possible, reclaim a page directly. */
255 			if (direct_reclaim && z->free_pages < z->pages_min + 8)
256 				page = reclaim_page(z);
257 			/* If that fails, fall back to rmqueue. */
258 			if (!page)
259 				page = rmqueue(z, order);
260 			if (page)
261 				return page;
262 		}
263 	}
264
265 	/* Found nothing. */
266 	return NULL;
267 }
3 2604 (CE ABL__alloc_pages( )"P0 for (HEK(ESBAE KL SLASHNA ANIL, ARACEAE.
FE reclaim_page( )M Til @ #2 64 inactive_clean_list BAF bie Hil, HARESEE mm/vmscanc HH
AeA de TET RU” A, UR Oy Le) T tl AR LOS A
ESWHA TAMA LSM direct_reclaim 4E 0, HLA IAM EAS
SBA, TAR WLM AER f(t ML AOE FILTRATE A__alloc_pages( Btn
HAM:
{alloc_pages( )> __alloc_pages( )]
365 	/*
366 	 * OK, none of the zones on our zonelist has lots
367 	 * of pages free.
368 	 *
369 	 * We wake up kswapd, in the hope that kswapd will
370 	 * resolve this situation before memory gets tight.
371 	 *
372 	 * We also yield the CPU, because that:
373 	 * - gives kswapd a chance to do something
374 	 * - slows down allocations, in particular the
375 	 *   allocations from the fast allocator that's
376 	 *   causing the problems ...
377 	 * - ... which minimises the impact the "bad guys"
378 	 *   have on the rest of the system
379 	 * - if we don't have __GFP_IO set, kswapd may be
380 	 *   able to free some memory we can't free ourselves
381 	 */
382 	wakeup_kswapd(0);
383 	if (gfp_mask & __GFP_WAIT) {
384 		__set_current_state(TASK_RUNNING);
385 		current->policy |= SCHED_YIELD;
386 		schedule();
387 	}
388
389 	/*
390 	 * After waking up kswapd, we try to allocate a page
391 	 * from any zone which isn't critical yet.
392 	 *
393 	 * Kswapd should, in most situations, bring the situation
394 	 * back to normal in no time.
395 	 */
396 	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
397 	if (page)
398 		return page;
399
399
‘PAARL ERARAZ kswapd, UES VA ih a SRM LL AM TI
BELA DMA MITT AS, RAL RGR YOM, FEELS UREA SOIRUERLE— PRR. GRE,
RIE kowapd # FY E37 BN HEI RSE ase a te AF 1 Fe SP
RAPE TUMORAL, WET IE I. RAPA TT EA AGATE, BR SA ERR TA ANS
YESAERY, BELLS PAGES_MIN #818 -2%__alloc_pages_limit( ). WHE, BUSTER? Lente
RHREAERA RAEI F . WRIA RRO ME RE RR AEE kswapd BY kreclaimd, AH RLE
“RES LES", ERICA EM AI RIMT AS, BSE HT, IR AR
OME FEMALE. RAGE ABA task_struct 44)!" fags FBLMY PE_MEMALLOC iat 1. RUT AT
TfGEFE, Bll PF_MEMALLOC tii (0.) 0 AUREREAUAL SR.
[alloc_pages( ) > __alloc_pages( )]
400 	/*
401 	 * Damn, we didn't succeed.
402 	 *
403 	 * This can be due to 2 reasons:
404 	 * - we're doing a higher-order allocation
405 	 * 	--> move pages to the free list until we succeed
406 	 * - we're /really/ tight on memory
407 	 * 	--> wait on the kswapd waitqueue until memory is freed
408 	 */
409 	if (!(current->flags & PF_MEMALLOC)) {
410 		/*
411 		 * Are we dealing with a higher order allocation?
412 		 *
413 		 * Move pages from the inactive_clean to the free list
414 		 * in the hope of creating a large, physically contiguous
415 		 * piece of free memory.
416 		 */
417 		if (order > 0 && (gfp_mask & __GFP_WAIT)) {
418 			zone = zonelist->zones;
419 			/* First, clean some dirty pages. */
420 			current->flags |= PF_MEMALLOC;
421 			page_launder(gfp_mask, 1);
422 			current->flags &= ~PF_MEMALLOC;
423 			for (;;) {
424 				zone_t *z = *(zone++);
425 				if (!z)
426 					break;
427 				if (!z->size)
428 					continue;
429 				while (z->inactive_clean_pages) {
430 					struct page * page;
431 					/* Move one page to the free list. */
432 					page = reclaim_page(z);
433 					if (!page)
434 						break;
435 					__free_page(page);
436 					/* Try if the allocation succeeds. */
437 					page = rmqueue(z, order);
438 					if (page)
439 						return page;
440 				}
441 			}
442 		}
443 		/*
444 		 * When we arrive here, we are really tight on memory.
445 		 *
446 		 * We wake up kswapd and sleep until kswapd wakes us
447 		 * up again. After that we loop back to the start.
448 		 *
449 		 * We have to do this because something else might eat
450 		 * the memory kswapd frees for us and we need to be
451 		 * reliable. Note that we don't loop back for higher
452 		 * order allocations since it is possible that kswapd
453 		 * simply cannot free a large enough contiguous area
454 		 * of memory *ever*.
455 		 */
456 		if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
457 			wakeup_kswapd(1);
458 			memory_pressure++;
459 			if (!order)
460 				goto try_again;
461 		/*
462 		 * If __GFP_IO isn't set, we can't wait on kswapd because
463 		 * kswapd just might need some IO locks /we/ are holding ...
464 		 *
465 		 * SUBTLE: The scheduling point above makes sure that
466 		 * kswapd does get the chance to free memory we can't
467 		 * free ourselves.
468 		 */
469 		} else if (gfp_mask & __GFP_WAIT) {
470 			try_to_free_pages(gfp_mask);
471 			memory_pressure++;
472 			if (!order)
473 				goto try_again;
474 		}
475
476 	}
477
RCA 4 SU RATT RE LH, Ake RE SA GRAD TF
MABRNGERD, EROTIK DMA, HE Rb A RM
inactive_clean_pages PARP. WRAL CAAT AER AAT TT. FAY, WTR ATMS BE”
TURES eH) inactive_dirty_pages BL, CALS MINAS Pl ae bate, eT Lee
SER AE" TOTTI. TEA, FLAT BE, {RESP RB IT page_launder( HE “HET” TH “YL
i" ERR “TURE IB”), AaB — AP flor (ER HE A cl ME HD EB” IE.
ALR ATALACAUFR BCL ot —~4* while (TENN. TE __free_page( EHH SIESTA BS
RATE AM TU, BTLMERPML T —4 i OUT ABE rmqueue( ik —F, BGA CARI
OR (AERA, 3-4 9A page_taunder( )!ilnl@ “AT HEFERY PE_MEMALLOC fide(tz Hea 1,
RAT RUT AR” MAR, Watt BETA AE? 1 ET VE page_launder( )' 12 BOR PAR AEG
WHEOU LAE el, AME PF_MEMALLOC $y ficient 1 SER] SERRE Ai BAY 409~476 47.
WURDE TPE Hi td LE AAMT, ABR AL] SPR TU TRA A RB ADEE ALORA
kswapd. (32 -R SPC TL Oi PEW BEER FF, HEL kswapd ZETERR T -HG3E TZ a FEAL KER SPD
THM. i) MRA RACED SO. Bt goto F414 MI__alloc_pages( FF IAbH ES
try_again Mb. 5} — ARAL ALU try_to_free_pages( ), iA“ AMAL JE TH kswapd TAHA.
BA, WIRE “WTAG” WE? a, FURR RRUT AI, CAVE Sw, ART UE
Hi, SUA EAR AERA TE A APTA AS try_again Mb.
+88.92% moa
BOER ESI, YC OT EWE AL__alloc_pages_limit( ist Se LIGA PAPTAR ATEN. i, HE
A—UKDL PAGES_MIN Yo -58, HIN) PU LE ay SM RE A aT SLY OKC”
popages_mine ZPADABRE A EA, ALU MUR SARDL, TUDE LRT “RA” MUTT.
RATALE(E F Fi__alloc_pages( ){Hfti.
[alloc_pages( ) > __alloc_pages( )]
478 	/*
479 	 * Final phase: allocate anything we can!
480 	 *
481 	 * Higher order allocations, GFP_ATOMIC allocations and
482 	 * recursive allocations (PF_MEMALLOC) end up here.
483 	 *
484 	 * Only recursive allocations can use the very last pages
485 	 * in the system, otherwise it would be just too easy to
486 	 * deadlock the system.
487 	 */
488 	zone = zonelist->zones;
489 	for (;;) {
490 		zone_t *z = *(zone++);
491 		struct page * page = NULL;
492 		if (!z)
493 			break;
494 		if (!z->size)
495 			BUG();
496
497 		/*
498 		 * SUBTLE: direct_reclaim is only possible if the task
499 		 * becomes PF_MEMALLOC while looping above. This will
500 		 * happen when the OOM killer selects this task for
501 		 * instant execution.
502 		 */
503 		if (direct_reclaim) {
504 			page = reclaim_page(z);
505 			if (page)
506 				return page;
507 		}
508
509 		/* XXX: is pages_min/4 a good amount to reserve for this? */
510 		if (z->free_pages < z->pages_min / 4 &&
511 				!(current->flags & PF_MEMALLOC))
512 			continue;
513 		page = rmqueue(z, order);
514 		if (page)
515 			return page;
516 	}
517
518 	/* No luck. */
519 	printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
520 	return NULL;
521 }
DREAM, RATT T
BES IFA BL: ARIK, GR NBULD A MIHAEARRL, B CPU A EDS O TRE
SUB RY YEE 2 TE RA ABLE PAL SJ) “KC” TS RE”, AR
ZEEE H © SR by AAS ACES RTT BT Bi PE A AE CE SE BS HS
SERGARIA To ASL) MGR RATS RT LAB Bet — 4S RSH SLB
28 Riaz Hkh
JEM AEC BUS, REE RT.
Pr TER LEE CPU ARRIETA, ERE CERT SE ALE AUR, SPR RAS oy a PAE
FUDFHN ARH, Linux AME BP ei, BE, DR ERI
SEABT EY GAR. AR, aT eam SAAN AAD, RII a a A ee ET
REM ATR SAR, TR AFIS SHR ATR He TRAY OTE. AR. ORECAST Lb Ste AA
WR. HL, MSR, PRS AMIN Ok, UGA S MM, BTCA AER TE
REY i A SIA eH RISB FA ES AE Lima A Be ERB he NE
APT AY “SH” kswapd.
MGIB EGR, kswapd FP PAE, ATC A UEP task_struct 4, FREER RE
ZAR COUMAE, TIAA AN BOC UIE, BMT WUC TRAD RE. RLS
GRAB METEHIG, kswapd ELTA. BE, CRA A CMCC bk i, PTDL CE URE
REICH A “BH” (thread) Liab]. AIA, kswapd (EAL HEROS IME? CEH AE A ROS
FA. FER, ESTAR REEL. Hk, CA MIE ere PM, ATLL
ARPS PRET, TAMU REL RSTO. (ARUGULA.
AHURA kowapd 2A BM AETIIS47 tk se RAT LAE.
SeHE kswapd HWA AR LAE mmvvmscan.c PFET E AVE
1146 static int __init kswapd_init(void)
1147 {
1148 	printk("Starting kswapd v1.8\n");
1149 	swap_setup();
1150 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1151 	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1152 	return 0;
1153 }
FRR kswapd_ini( LE RAPA CHIESA, CRAMP. SB AEECE swap_setp( )
PLA DE AN FRA RE — Fa page_cluster:
0.2S eres
{kswapdl_init( ) > swap_setup( )]
293 /*
294  * Perform any setup for the swap system
295  */
296 void __init swap_setup(void)
297 {
298 	/* Use a smaller cluster for memory <16MB or <32MB */
299 	if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT))
300 		page_cluster = 2;
301 	else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))
302 		page_cluster = 3;
303 	else
304 		page_cluster = 4;
305 }
CE-RAR EM AAKWSM. HTRMAN ARS Se, IF SRE TCR RATA
SE, PUM RERAR— TRB ATE. LUE INE RET RFS ELT,
“Rik”. GEARRAAEKRRE ESO ATR, ROR MEMMR, TRUE YEE
AER AOKI RHE EAS BLA E SHE . BMRA LOE AAZ kswapd, 3X 2 th kernel_thread( )
FEIN. JA SUB HET PAR kreclaimd, WAM, ALAM kewapd MAR RAL
SE, RCRA BA EAE add. KT AS EA SE, AK SER kewapd
BROERESE SFP HABER kswapd( )FPMAT. SURE ZE mm/vmscan.c 1";
on /*
948 * The background pageout daemon, started as a kernel thread
949 from the init process.
9580
951 * This basically trickles out pages so that we have _some_
952 * free memory available even if there is no other activity
953 * that frees anything up. This is needed for things Like routing
954 * etc, where we otherwise might have all activity going on in
955 * asynchronous contexts that cannot page things out.
9560 €
957 * Tf there are applications that are active memory-al locators
958 * (most normal use), this basically shouldn’ t matter.
959 #/
960 int kswapd(void *unused)
961 {
962         struct task_struct *tsk = current;
963
964         tsk->session = 1;
965         tsk->pgrp = 1;
966         strcpy(tsk->comm, "kswapd");
967         sigfillset(&tsk->blocked);
968         kswapd_task = tsk;
91.IOC OT
Www.zzbaike.com
Linux A Healt sy sbi tier
969
970         /*
971          * Tell the memory management that we're a "memory allocator",
972          * and that if we need more memory we should get access to it
973          * regardless (see "__alloc_pages()"). "kswapd" should
974          * never get caught in the normal page freeing logic.
975          *
976          * (Kswapd normally doesn't need memory anyway, but sometimes
977          * you need a small amount of memory in order to be able to
978          * page out something else, and this flag essentially protects
979          * us from recursively trying to free more memory as we're
980          * trying to free the first piece of memory in the first place).
981          */
982         tsk->flags |= PF_MEMALLOC;
983
984         /*
985          * Kswapd main loop.
986          */
987         for (;;) {
988                 static int recalc = 0;
989
990                 /* If needed, try to free some memory. */
991                 if (inactive_shortage() || free_shortage()) {
992                         int wait = 0;
993                         /* Do we need to do some synchronous flushing? */
994                         if (waitqueue_active(&kswapd_done))
995                                 wait = 1;
996                         do_try_to_free_pages(GFP_KSWAPD, wait);
997                 }
998
999                 /*
1000                  * Do some (very minimal) background scanning. This
1001                  * will scan all pages on the active list once
1002                  * every minute. This clears old referenced bits
1003                  * and moves unused pages to the inactive list.
1004                  */
1005                 refill_inactive_scan(6, 0);
1006
1007                 /* Once a second, recalculate some VM stats. */
1008                 if (time_after(jiffies, recalc + HZ)) {
1009                         recalc = jiffies;
1010                         recalculate_vm_stats();
1011                 }
1012
1013                 /*
1014                  * Wake up everybody waiting for free memory
1015                  * and unplug the disk queue.
1016                  */
1017                 wake_up_all(&kswapd_done);
1018                 run_task_queue(&tq_disk);
1019
1020                 /*
1021                  * We go to sleep if either the free page shortage
1022                  * or the inactive page shortage is gone. We do this
1023                  * because:
1024                  * 1) we need no more free pages   or
1025                  * 2) the inactive pages need to be flushed to disk,
1026                  *    it wouldn't help to eat CPU time now ...
1027                  *
1028                  * We go to sleep for one second, but if it's needed
1029                  * we'll be woken up earlier.
1030                  */
1031                 if (!free_shortage() || !inactive_shortage()) {
1032                         interruptible_sleep_on_timeout(&kswapd_wait, HZ);
1033                 /*
1034                  * If we couldn't free enough memory, we see if it was
1035                  * due to the system just not having enough memory.
1036                  * If that is the case, the only solution is to kill
1037                  * a process (the alternative is enternal deadlock).
1038                  *
1039                  * If there still is enough memory around, we just loop
1040                  * and try free some more memory...
1041                  */
1042                 } else if (out_of_memory()) {
1043                         oom_kill();
1044                 }
1045         }
1046 }
EEE CREO, FLEA EIR. MAR AD
interruptible_sleep_on_timeout( )i# AWEOR, Uc Py 4% 6 HMI RUMEARICT. (UA GEIS TAL IS
LETRAS kswapd BEALE, EMH kewapd BLEIBT. BA KE
AVA” ALS AWE, GABLALATME HZ. HZ dei T ARR EPA & DIR PPP. FEL) LATER EK
HUN REACT ERE AL (ELI ZB BR ARE ROK T FUL, 118 interruptible_sleep_on_timeout( )
RNB HZ. Zen 1 PPL MSE kswapd HIE T. HZ. Hf interruptible_sleep_on_timeout( )
SWEAR 1 PLUG EDK. (Rk, ERR PARA TE 1 SOREL RIERE, S
FE kowapa Bi Cobh i ol DIF AG BiB HE FTL, IX RR cb EI 1 PERT, EOLA kswapd
NAT BER
BA, kswapd ZS > Re — UR AY TE eet | ZU? AT LGBE PRN ABSY OB BRON EE,
RIMARH LAR FART M, AMF RRMA, Eee MUNI,
FEL TARA A BORAS, A TUTTI ERE. RAS AAR AES ATA,
INCE TA CEB bP BORA * TUM SAS BGR SECM ANERR “EUR” TT a
Ah, Bot A FCP ENE OC TT
93.Linux 4 iit i
FAAS, ESSA TEA Ae OL MRR DE a A
[kswapd() > inactive_shortage()]
805 /*
806  * How many inactive pages are we short?
807  */
808 int inactive_shortage(void)
809 {
810         int shortage = 0;
811
812         shortage += freepages.high;
813         shortage += inactive_target;
814         shortage -= nr_free_pages();
815         shortage -= nr_inactive_clean_pages();
816         shortage -= nr_inactive_dirty_pages;
817
818         if (shortage > 0)
819                 return shortage;
820
821         return 0;
822 }
RP AAS ALT OLIN YS LOT, AGBLAE freepages.high Ml inactive targets 4}
Ba 3 A) AAAS GB TH, AA TEA BT 6 NT A
KERMA STAR. TSA MESA, KASAM RTL SN TU. SALT A LTE
ATER RT LAF ROBHLIER AN 2.4, 2" SL IRTES SH THER, MCE HH nr_free_pages()
ME. BOM AGAR SP" HR, ROTA Le bas Ren i
ENA P MARTELL, OR MATT AB FR RR BEA. RATT
SHES POET, (IRATE, FERC! nr_inactive_clean_pages (IMLMIT» Rela
BUM IRR “ME” TU, RAT OMG", ROG REA OL A MBE ONL. CAAT
AMER TBARED, PREP HY EE or_inactive_dirty_pages 1A MSIL AUT He. LP
AMO RESARZE mmipage_allocc , WALA HM, BTU UR.
Ait, Hee ATT OL ME AME, BAG free_shortage( RXTE BANAT PUREE
EPA PEMC KOR, BPELHETT Pes MeN TUM MR CORANIARK “AE” TURTLE) ARB FRAG.
PBR RBE mmivscanc , LMC MARE.
WRART KTR AE LT, RELA RHA TR, BRB
do_try_to_free_pages( )5ehM. Ait eth MBSA waitqueve_active( ), HF kswapd_done [}5*
REARS THAT, THUR IE BAAEMS do_try_to_free_pages( ). #698 3-4. RF
BRAARPALARMIL, AB RPA CREW Baka) AT LUE Animate AIRE
AYIA, MASA CE CSI AE AR ENT AEA BLAILAT:« i Kswapd_done, BLIEIX#RAY -MIAF. JL
EEA MASIMR ML, Ze kswapd AisEat ROUTE MIRE MT BEAL BIBT. ROE inline wb
waitqueve_active( MEA BA Ai eh BERR PATI EAST. SOE ZE include/linux/wait.h Hs
+94.22 tease
[kswapd() > waitqueue_active()]
152 static inline int waitqueue_active(wait_queue_head_t *q)
153 {
154 #if WAITQUEUE_DEBUG
155         if (!q)
156                 WQ_BUG();
157         CHECK_MAGIC_WQHEAD(q);
158 #endif
159
160         return !list_empty(&q->task_list);
161 }
FHQE W do_try_to_free_pages(), iRPAMRH) -22A47 TUM. HRESZE vinscan.c Ps
[kswapd() > do_try_to_free_pages()]
907 static int do_try_to_free_pages(unsigned int gfp_mask, int user)
908 {
909         int ret = 0;
910
911         /*
912          * If we're low on free pages, move pages from the
913          * inactive_dirty list to the inactive_clean list.
914          *
915          * Usually bdflush will have pre-flushed the pages
916          * before we get around to moving them to the other
917          * list, so this is a relatively cheap operation.
918          */
919         if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
920                         nr_inactive_clean_pages())
921                 ret += page_launder(gfp_mask, user);
922
923         /*
924          * If needed, we move pages from the active list
925          * to the inactive list. We also "eat" pages from
926          * the inode and dentry cache whenever we do this.
927          */
928         if (free_shortage() || inactive_shortage()) {
929                 shrink_dcache_memory(6, gfp_mask);
930                 shrink_icache_memory(6, gfp_mask);
931                 ret += refill_inactive(gfp_mask, user);
932         } else {
933                 /*
934                  * Reclaim unused slab cache memory.
935                  */
936                 kmem_cache_reap(gfp_mask);
937                 ret = 1;
+95.Te Soya
eae Neely
Linu A RARE af ce aa ese
938         }
939
940         return ret;
941 }
HARTMAN, ELMAR, CPUTRM ARMS, AAG OTIZ,
Py ih LP RO — TR A RR ARTE “REE LEN” AAT
PEROMEN), ABIES “WGK DUH THE”. TLL, AE AAY ORR” TEENA. MPR
1B, RUBRICS GME. BEAD DNR AME. WILALIEIE] page launder), HARI OLH A ADR
0 AE" TU “Bee”, PETER OLA. AAA “launder”, BEAL “EAT.” AY
SB. RGM HORA LEME kswapd BK, AERA RECA A, OX
AMAT, BABA. SCARROZE mm/vmscan.c Hs
[kswapd() > do_try_to_free_pages() > page_launder()]
465 /**
466  * page_launder - clean dirty inactive pages, move to inactive_clean list
467  * @gfp_mask: what operations we are allowed to do
468  * @sync: should we wait synchronously for the cleaning of pages
469  *
470  * When this function is called, we are most likely low on free +
471  * inactive_clean pages. Since we want to refill those pages as
472  * soon as possible, we'll make two loops over the inactive list,
473  * one to move the already cleaned pages to the inactive_clean lists
474  * and one to (often asynchronously) clean the dirty inactive pages.
475  *
476  * In situations where kswapd cannot keep up, user processes will
477  * end up calling this function. Since the user process needs to
478  * have a page before it can continue with its allocation, we'll
479  * do synchronous page flushing in that case.
480  *
481  * This code is heavily inspired by the FreeBSD source code. Thanks
482  * go out to Matthew Dillon.
483  */
484 #define MAX_LAUNDER             (4 * (1 << page_cluster))
485 int page_launder(int gfp_mask, int sync)
486 {
487         int launder_loop, maxscan, cleaned_pages, maxlaunder;
488         int can_get_io_locks;
489         struct list_head * page_lru;
490         struct page * page;
491
492         /*
493          * We can only grab the IO locks (eg. for flushing dirty
494          * buffers to disk) if __GFP_IO is set.
495          */
496         can_get_io_locks = gfp_mask & __GFP_IO;
497
498         launder_loop = 0;
499         maxlaunder = 0;
500         cleaned_pages = 0;
501
502 dirty_page_rescan:
503         spin_lock(&pagemap_lru_lock);
504         maxscan = nr_inactive_dirty_pages;
505         while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
506                                 maxscan-- > 0) {
507                 page = list_entry(page_lru, struct page, lru);
508
509                 /* Wrong page on list?! (list corruption, should not happen) */
510                 if (!PageInactiveDirty(page)) {
511                         printk("VM: page_launder, wrong page on list.\n");
512                         list_del(page_lru);
513                         nr_inactive_dirty_pages--;
514                         page->zone->inactive_dirty_pages--;
515                         continue;
516                 }
517
518                 /* Page is or was in use?  Move it to the active list. */
519                 if (PageTestandClearReferenced(page) || page->age > 0 ||
520                                 (page->buffers && page_count(page) > 1) ||
521                                 page_ramdisk(page)) {
522                         del_page_from_inactive_dirty_list(page);
523                         add_page_to_active_list(page);
524                         continue;
525                 }
526
527                 /*
528                  * The page is locked. IO in progress?
529                  * Move it to the back of the list.
530                  */
531                 if (TryLockPage(page)) {
532                         list_del(page_lru);
533                         list_add(page_lru, &inactive_dirty_list);
534                         continue;
535                 }
536
537                 /*
538                  * Dirty swap-cache page? Write it out if
539                  * last copy..
540                  */
541                 if (PageDirty(page)) {
542                         int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
543                         int result;
97.Wes
Linux os EF a cena
544
545                         if (!writepage)
546                                 goto page_active;
547
548                         /* First time through? Move it to the back of the list */
549                         if (!launder_loop) {
550                                 list_del(page_lru);
551                                 list_add(page_lru, &inactive_dirty_list);
552                                 UnlockPage(page);
553                                 continue;
554                         }
555
556                         /* OK, do a physical asynchronous write to swap.  */
557                         ClearPageDirty(page);
558                         page_cache_get(page);
559                         spin_unlock(&pagemap_lru_lock);
560
561                         result = writepage(page);
562                         page_cache_release(page);
563
564                         /* And re-start the thing.. */
565                         spin_lock(&pagemap_lru_lock);
566                         if (result != 1)
567                                 continue;
568                         /* writepage refused to do anything */
569                         set_page_dirty(page);
570                         goto page_active;
571                 }
572
573                 /*
574                  * If the page has buffers, try to free the buffer mappings
575                  * associated with this page. If we succeed we either free
576                  * the page (in case it was a buffercache only page) or we
577                  * move the page to the inactive_clean list.
578                  *
579                  * On the first round, we should free all previously cleaned
580                  * buffer pages
581                  */
582                 if (page->buffers) {
583                         int wait, clearedbuf;
584                         int freed_page = 0;
585                         /*
586                          * Since we might be doing disk IO, we have to
587                          * drop the spinlock and take an extra reference
588                          * on the page so it doesn't go away from under us.
589                          */
590                         del_page_from_inactive_dirty_list(page);
591                         page_cache_get(page);
592                         spin_unlock(&pagemap_lru_lock);
593
594                         /* Will we do (asynchronous) IO? */
595                         if (launder_loop && maxlaunder == 0 && sync)
596                                 wait = 2;       /* Synchrounous IO */
597                         else if (launder_loop && maxlaunder-- > 0)
598                                 wait = 1;       /* Async IO */
599                         else
600                                 wait = 0;       /* No IO */
601
602                         /* Try to free the page buffers. */
603                         clearedbuf = try_to_free_buffers(page, wait);
604
605                         /*
606                          * Re-take the spinlock. Note that we cannot
607                          * unlock the page yet since we're still
608                          * accessing the page_struct here...
609                          */
610                         spin_lock(&pagemap_lru_lock);
611
612                         /* The buffers were not freed. */
613                         if (!clearedbuf) {
614                                 add_page_to_inactive_dirty_list(page);
615
616                         /* The page was only in the buffer cache. */
617                         } else if (!page->mapping) {
618                                 atomic_dec(&buffermem_pages);
619                                 freed_page = 1;
620                                 cleaned_pages++;
621
622                         /* The page has more users besides the cache and us. */
623                         } else if (page_count(page) > 2) {
624                                 add_page_to_active_list(page);
625
626                         /* OK, we "created" a freeable page. */
627                         } else /* page->mapping && page_count(page) == 2 */ {
628                                 add_page_to_inactive_clean_list(page);
629                                 cleaned_pages++;
630                         }
631
632                         /*
633                          * Unlock the page and drop the extra reference.
634                          * We can only do it here because we are accessing
635                          * the page struct above.
636                          */
637                         UnlockPage(page);
638                         page_cache_release(page);
639
640                         /*
641                          * If we're freeing buffer cache pages, stop when
642                          * we've got enough free memory.
643                          */
644                         if (freed_page && !free_shortage())
645                                 break;
646                         continue;
647                 } else if (page->mapping && !PageDirty(page)) {
648                         /*
649                          * If a page had an extra reference in
650                          * deactivate_page(), we will find it here.
651                          * Now the page is really freeable, so we
652                          * move it to the inactive_clean list.
653                          */
654                         del_page_from_inactive_dirty_list(page);
655                         add_page_to_inactive_clean_list(page);
656                         UnlockPage(page);
657                         cleaned_pages++;
658                 } else {
659 page_active:
660                         /*
661                          * OK, we don't know what to do with the page.
662                          * It's no use keeping it here, so we move it to
663                          * the active list.
664                          */
665                         del_page_from_inactive_dirty_list(page);
666                         add_page_to_active_list(page);
667                         UnlockPage(page);
668                 }
669         }
670         spin_unlock(&pagemap_lru_lock);
ACESS OHA cleaned_pages FURR 14 “VLA” AVUTINIRE AL. 5} —“P ah ABA launder_loop HIE
HERURAEIR “HE” TBA FUME. ZEA — RE HAY launder_loop 90, MURA LEU AHS
48, TRIE BL AL 1 IRIEL BRAS dirty_page_rescan ¥h(S02 47), JPA M— TKI.
AATARR “RE” TUNDRA EL —F while TIK(S0S FUT. EBT HEMMER ei ee
TM ERTL LAU BA AS I, TLRS RESIS Th PhS Ee Lee, PE RL BS Wh
Ti, RAR AT, XG maxscan HIE.
MPWA PRET IU, BACHE EM PC_inactive_dimy Mae 1, AMMA AMMAR
TOI, EIN TPE, BEDE AMUN CL 512 77). Buh. ATE AY
ial, UBER URES a ME PARE
WH RR ARE AAA “HE” AS, ULI PAREBA, see ARS
WADARS ER AL MIELE”, Al SBE PL BE Ha BA (S19 ~-525 MO RTT:
HUM ZEREA TANNER “WE” DODO UZ BU PA, WNBA T DU SCAT BRR
ie IATL L7H TU AER.
+100.lm ARAL
TUM “Aen” ARERR. TERY page SMP AEB age, HRS TSU Mal MaRS
RAK. HHRMEEAMK MA
FU AEE / "5 ROE, FOU SAD RAF 1. BOY TTT AE Rb — SE
ARORA THB. TUTTE, PA ERM Bem 1, De CRS
PEALE THT SON 1, CEM E / SC. RT AE
SSPE, AZ, PRE ART 1 SSE MER AE AL TT
Rin CEE UE EPR ATH ALE ramdisk, OVARY TEER A. IATL
SRAM BE: .
WM BB B(S31 fF). PTL TryLockPage( )iHIE 1, RRVNI ATI METTSeE. on
ME, PRTC EAR “ME” SCTE, EERO UREA. UR,
NARA, MECBRET.
ARTUR “AL” WUE 47). BN page SHAY PG_diny FEAT 1, TY ES
SCR, A A eS RE BS41~571 17). PA» ATIMBY address_space Bc
PDAS UREN, BURL A OPH page_active Mb, MTGE TEER HDA
Je A — BA TTL HR, TIBIA) address_space E48 4 JJ) swapper space.
address_space_operations $44) 79 swap_aops, JV #2 (H (5 3 thi’ tH PRE swap_writepage( ), i
2 OR" ARAB. CEN AD, HERR) DA, MPS
RH(531~535 17). RETRO, MRAM EMO SNAT. SZ aL
ClearPageDirty( 48 hilt] PG_dirty pick (icdita O, 9X feist ce VIR address_space MV MIAT
RAN RAHRES UE. RA RANT, HM Hie i, Ret
mmap( )2E CRY SCBA CER SEINE / SBE, FHA PSR ET AE
AAR ATER, BARS dicem), AETAR ARI, (ae ee AEST
Re, CELIA KAT REPRE, page launder( ), AiDLAP EB RHR PTS “Yk
Jit AARNE PG_dinty pr O MAM. RE, RRL NSA T Cb
S41 #7). HES, BERRA S WAM HTET AL, FRU RETE'S HAM RIL 1, Ae
page_launder( ) FY LAH SZ 12 IRIE PG_dirty #pi(0e FHF HIRE ARAM GBA FIP (569~-570 47 D0
WEEE TP, ELAR EL AAG writepage oh HiBY ALB page_cache_get( )i@!# sUnINO EHH
Bee Mik eR EL a I page_cache_release( BWIA TH ME, Ror TEAR NTS AE)
BI ~*BITE Ah AY DS BAR “Ph” UA,
WMA BALEA PC_dirty HRA TO. DEER, WR CPU BUA TARAS PAY 582 7, WIT
TERY PG_diny Pah 0, IAP EAE SMe RT A.
SORA “ME” 8S, JP AOR esc Pie / Sapte oe ti682~647 17), MUSE EL
ATER “AL” TURES, FEMI wry_to_free_buffers( URE RE HIK. RARER ERR
RAAB AER HE” TAS, RFRA, MAR PH” TL
HBT). WR ACT, MMO BM try_to_free_buffers( yw 1, 638 4714)
page_cache_release( )# (EHR 1 BABI T 0, Mit eA TUT AR CPB A TCA VY. Say
ARMA, HARRAH SAMOA RB RR, BALBOA
Ti 644 Fil 645 47). URES. HAA try_to_free_buffers( MI {US¢E fs/bufferc , ee
TOEFAT “OCA” ~ RU ATA.
(5) MR TAHA “BE” EA. FFALAESE +S address_space BURA HAUK I}! , LARUE S “pti”
101.
2)
6
“ay
eau i mee)
Linux BARA
TK. PROMO Pee BUTE Ie AR “Te” RM.
(©) FRG, CRAM T LAE — PBL (658 17), HORE AL ACAD IRAY TR, FLARE
TPA.
ERT MUS, KE RUER PEW LAR, USM efp_mask +i _GEP_IO
FEHR AY 1, RAMESH.
[kswapd() > do_try_to_free_pages() > page_launder()]
671
672         /*
673          * If we don't have enough free pages, we loop back once
674          * to queue the dirty pages for writeout. When we were called
675          * by a user process (that /needs/ a free page) and we didn't
676          * free anything yet, we wait synchronously on the writeout of
677          * MAX_SYNC_LAUNDER pages.
678          *
679          * We also wake up bdflush, since bdflush should, under most
680          * loads, flush out the dirty pages before we have to wait on
681          * IO.
682          */
683         if (can_get_io_locks && !launder_loop && free_shortage()) {
684                 launder_loop = 1;
685                 /* If we cleaned pages, never do synchronous IO. */
686                 if (cleaned_pages)
687                         sync = 0;
688                 /* We only do a few "out of order" flushes. */
689                 maxlaunder = MAX_LAUNDER;
690                 /* Kflushd takes care of the rest. */
691                 wakeup_bdflush(0);
692                 goto dirty_page_rescan;
693         }
694
695         /* Return the number of pages moved to the inactive_clean list. */
696         return cleaned_pages;
697 }
MURATA. BATH, BLLS| 502 {FAS dirty_page_rescan Ab. PEAIX HAE launder_loop #
AT 1, CRA ATER 2 NAIK T . ATLLEUKIAHY page_launder( ie ® 21 ¢ Maha
FFB do_try_to_free_pages( )iJfRES', ist page_launder( UUs MSR) 4} Acs TTT CATR
RAE, ABBE S EAP BENT 6 AB aL, SPAR AEH UE FT Re
TPL, TA Oe TP, a tH kT HH = eR MKC shrink_deache_memory( ).
shrink_icache_memory(). refill_inactive( )), LAR “F “FA¢S HH1AY kmem_cache_reap( HAL. 72“
BRR HY, AHA, CAT FECML B RECT SORA dentry A AH
ARE RS AH inode BRL. EUR RTE ALUM BOE
LRU SUPT, ERECT AHR ID PRET EB). GR, Sick RAL, BOA nl AE
+102.BIE fhews
BREKEW deouy SAMA inode RAH, HAMRM NMEA. ke, Rea
shrink_dcache_memory( )#I shrink_icache_memory( )i M/ONUAIBIL, ELMER RX Hoste 4 #5 388 Fa
BEAST Oi". Tt, RGA, PU eet tb ay Ba dtl AP A, eb
BRAT AGU “slab” WEA. WEA SAR, RARE ATE GR” Oe
KG. ASME ER”, MR RANETT, CHEM MAUI AR Ae.
shb FEL A TCR SOSA MEE, PERE, Hb RAL
(ASB iL kmem_cache_reap( 3K “WH”. ERATE T “ORR” BADR CRT
BRUINS, RRA IAE RUSE REE refill_inactive(), SWE mm/vmscan.c
[kswapd() > do_try_to_free_pages() > refill_inactive()]
824 /*
825  * We need to make the locks finer granularity, but right
826  * now we need this so that we can do page allocations
827  * without holding the kernel lock etc.
828  *
829  * We want to try to free "count" pages, and we want to
830  * cluster them so that we get good swap-out behaviour.
831  *
832  * OTOH, if we're a user process (and not kswapd), we
833  * really care about latency. In that case we don't try
834  * to free too many pages.
835  */
836 static int refill_inactive(unsigned int gfp_mask, int user)
837 {
838         int priority, count, start_count, made_progress;
839
840         count = inactive_shortage() + free_shortage();
841         if (user)
842                 count = (1 << page_cluster);
843         start_count = count;
844
845         /* Always trim SLAB caches when memory gets low. */
846         kmem_cache_reap(gfp_mask);
847
848         priority = 6;
849         do {
850                 made_progress = 0;
851
852                 if (current->need_resched) {
853                         __set_current_state(TASK_RUNNING);
854                         schedule();
855                 }
856
857                 while (refill_inactive_scan(priority, 1)) {
858                         made_progress = 1;
859                         if (--count <= 0)
860                                 goto done;
861                 }
862
863                 /*
864                  * don't be too light against the d/i cache since
865                  * refill_inactive() almost never fail when there's
866                  * really plenty of memory free.
867                  */
868                 shrink_dcache_memory(priority, gfp_mask);
869                 shrink_icache_memory(priority, gfp_mask);
870
871                 /*
872                  * Then, try to page stuff out..
873                  */
874                 while (swap_out(priority, gfp_mask)) {
875                         made_progress = 1;
876                         if (--count <= 0)
877                                 goto done;
878                 }
879
880                 /*
881                  * If we either have enough free memory, or if
882                  * page_launder() will be able to make enough
883                  * free memory, then stop.
884                  */
885                 if (!inactive_shortage() || !free_shortage())
886                         goto done;
887
888                 /*
889                  * Only switch to a lower "priority" if we
890                  * didn't make any useful progress in the
891                  * last loop.
892                  */
893                 if (!made_progress)
894                         priority--;
895         } while (priority >= 0);
896
897         /* Always end on a refill_inactive.., may sleep... */
898         while (refill_inactive_scan(0, 1)) {
899                 if (--count <= 0)
900                         goto done;
901         }
902
903 done:
904         return (count < start_count);
905 }
104.Re HAR
PR user BM kswapd fe PHN, RRB A BBC kewapd_done HIN PEBPAT, ATARRE
(EL AB SHE EY A, TLR ANCE
‘HEMLAL kmem_cache_reap( )“H6CHH" Li slab HLH SLAVS MER ILM, BRITT AE REN,
RETREAT “ARLE RNR” TS ROAR TRIS.
RUB, BRP do-while FF. TAMIR 6 BIE, BabA “re” 0B, A
RRA TA, SCM T, RATAN AAA, Mth T CETL
PRR REM ITA T eee).
BAD, SOAAAAAE — PF AEF task_struct 47H) "PH need_resched JE A 1. SURE,
LEAP MAR EAP SER IABE, BTLLADAY schedule( yuk WR METT—UCiRIBE, (EUR ZEUS AE AT
HARARE AR TASK RUNNING, 2k BEERS TOM. RAED 4 Std SSAA, ask_struct $444)
"Pit need_resched AEB IM RAH, A CPU ART KARAM P AR. MARIA
BP Si RAMEE. ATA, kswapd BDARRM, AGES MIMI)”,
ARES PALIT tE CPU ASI BULA REAR “EAE READ AES RICE el AT
EAMG FFE schedule( ).
WA, TEMA PRE AE? BAAR. ALIBI refill_inactive_scan( }#I#M35 8K iti BAW),
RUA RAT AAR RI, PEASE, swap_out( RR “SEA, STH CBT
KMPER ATA ARORA ATL. Hoh, SEPARA dentry 494i inode IKKE.
‘64 refill_inactive_scan( )i#/4UE3, 3X4 HALE mmv/vmscan.c Fs
699 /**
700  * refill_inactive_scan - scan the active list and find pages to deactivate
701  * @priority: the priority at which to scan
702  * @oneshot: exit after deactivating one page
703  *
704  * This function will scan a portion of the active list to find
705  * unused pages, those pages will then be moved to the inactive list.
706  */
707 int refill_inactive_scan(unsigned int priority, int oneshot)
708 {
709         struct list_head * page_lru;
710         struct page * page;
711         int maxscan, page_active = 0;
712         int ret = 0;
713
714         /* Take the lock while messing with the list... */
715         spin_lock(&pagemap_lru_lock);
716         maxscan = nr_active_pages >> priority;
717         while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
718                 page = list_entry(page_lru, struct page, lru);
719
720                 /* Wrong page on list?! (list corruption, should not happen) */
721                 if (!PageActive(page)) {
722                         printk("VM: refill_inactive, wrong page on list.\n");
723                         list_del(page_lru);
724                         nr_active_pages--;
725                         continue;
726                 }
727
728                 /* Do aging on the pages. */
729                 if (PageTestandClearReferenced(page)) {
730                         age_page_up_nolock(page);
731                         page_active = 1;
732                 } else {
733                         age_page_down_ageonly(page);
734                         /*
735                          * Since we don't hold a reference on the page
736                          * ourselves, we have to do our test a bit more
737                          * strict then deactivate_page(). This is needed
738                          * since otherwise the system could hang shuffling
739                          * unfreeable pages from the active list to the
740                          * inactive_dirty list and back again...
741                          *
742                          * SUBTLE: we can have buffer pages with count 1.
743                          */
744                         if (page->age == 0 && page_count(page) <=
745                                                 (page->buffers ? 2 : 1)) {
746                                 deactivate_page_nolock(page);
747                                 page_active = 0;
748                         } else {
749                                 page_active = 1;
750                         }
751                 }
752
753                 /*
754                  * If the page is still on the active list, move it
755                  * to the other end of the list. Otherwise it was
756                  * deactivated by age_page_down and we exit successfully.
757                  */
758                 if (page_active || PageActive(page)) {
759                         list_del(page_lru);
760                         list_add(page_lru, &active_list);
761                 } else {
762                         ret = 1;
763                         if (oneshot)
764                                 break;
765                 }
766         }
767         spin_unlock(&pagemap_lru_lock);
768         return ret;
769 }
REG AE” ATU AE. GALL —“S eb A maxscan RESTA AT TTT BRE. 7S
Nes.
Linwx PHBE La) esa
nr_active_pages--;
continue;
}
/* Do aging on the pages. */
if (PageTestandClearReferenced(page)) {
‘age_page_up nolock (nage) :
page active = 1;
) else (
age. page down_ageonly (page) ;
ix
* Since we don’t hold a reference on the page
ourselves, we have to do our test a bit more
strict then deactivate_page( ). This is needed
since otherwise the system could hang shuffling
unfreeable pages from the active list to the
inactive dirty list and back again...
RRR RH
SUBTLE: we can have buffer pages with count 1.
*/
if (page—>age = 0 && page_count (page) <=
(page->buffers ? 2: 1) {
deactivate_page_nolock (page) ;
Page_active
} else {
Page_active =
}
}
i*
* If the page is still on the active list, move it
* to the other end of the list. Otherwise it was
* deactivated by age_page_down and we exit successfully.
*/
if (page_active || PageActive(page)) {
List_del (page_lru) ;
list_add(page_lru, &active_list);
} else f
ret = 1;
if (oneshot)
break;
)
}
spin_unlock (&pagemap_Iru_lock) ;
return ret;
}
+ 106 -Ble telewn
RRA ELE SRILA], TLRS AL priority MULE ASO, RATE
prlority 7 0 BY A FLAMES ASIC 716 7). RF STAR OA) I, BE SP IHC, 72
‘ide MR, AUREL ASLAM 729 F)- BENE MURR LG Ae. MURAD SAF
ERAT 0. MARMARA CARMA LE GA, BCR TA, BM RR T
UT ROCHA OCRAE ATR, OR RTS ATRIAL. RIUM AA
SRI / SAR, MAR EOE ROK 1 ROLF PIRI, IEA RENE A ANI
BARS IL 144 47), IEA AVTUTT AEST owap_out( EAH MUINERRISRIN AREA MERE A TODOS. XE
FHERREMATRSORAMUIURL, ERI IMSUP AY HAC BE SIOA UO. LZ, MR RH
ARMA T AERA, RUHL BM oneshot (OULIE HE AAT. ARORA, CERSRE LTD IL
POSE RTM ASE 1. 124 swap_out( BF TF—7> CORY WET CHORE A ASRARASIT, UE
RAAT RATL, RATER TUT OTA, SOUR PRET WSCCETERPAR
hte. He HMA” wT LR
18H swap_out( ) AVEAE mmivmscane 4H Les
[kswapd() > do_try_to_free_pages() > refill_inactive() > swap_out()]
297 /*
298  * Select the task with maximal swap_cnt and try to swap out a page.
299  * N.B. This function returns only 0 or 1.  Return values != 1 from
300  * the lower level routines result in continued processing.
301  */
302 #define SWAP_SHIFT 5
303 #define SWAP_MIN 8
304
305 static int swap_out(unsigned int priority, int gfp_mask)
306 {
307         int counter;
308         int __ret = 0;
309
310         /*
311          * We make one or two passes through the task list, indexed by
312          * assign = {0, 1}:
313          *   Pass 1: select the swappable task with maximal RSS that has
314          *         not yet been swapped out.
315          *   Pass 2: re-assign rss swap_cnt values, then select as above.
316          *
317          * With this approach, there's no need to remember the last task
318          * swapped out.  If the swap-out fails, we clear swap_cnt so the
319          * task won't be selected again until all others have been tried.
320          *
321          * Think of swap_cnt as a "shadow rss" - it tells us which process
322          * we want to page out (always try largest first).
323          */
324         counter = (nr_threads << SWAP_SHIFT) >> priority;
325         if (counter < 1)
326                 counter = 1;
327
328         for (; counter >= 0; counter--) {
329                 struct list_head *p;
330                 unsigned long max_cnt = 0;
331                 struct mm_struct *best = NULL;
332                 int assign = 0;
333                 int found_task = 0;
334         select:
335                 spin_lock(&mmlist_lock);
336                 p = init_mm.mmlist.next;
337                 for (; p != &init_mm.mmlist; p = p->next) {
338                         struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
339                         if (mm->rss <= 0)
340                                 continue;
341                         found_task++;
342                         /* Refresh swap_cnt? */
343                         if (assign == 1) {
344                                 mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
345                                 if (mm->swap_cnt < SWAP_MIN)
346                                         mm->swap_cnt = SWAP_MIN;
347                         }
348                         if (mm->swap_cnt > max_cnt) {
349                                 max_cnt = mm->swap_cnt;
350                                 best = mm;
351                         }
352                 }
353
354                 /* Make sure it doesn't disappear */
355                 if (best)
356                         atomic_inc(&best->mm_users);
357                 spin_unlock(&mmlist_lock);
358
359                 /*
360                  * We have dropped the tasklist_lock, but we
361                  * know that "mm" still exists: we are running
362                  * with the big kernel lock, and exit_mm()
363                  * cannot race with us.
364                  */
365                 if (!best) {
366                         if (!assign && found_task > 0) {
367                                 assign = 1;
368                                 goto select;
369                         }
370                         break;
371                 } else {
372                         __ret = swap_out_mm(best, gfp_mask);
373                         mmput(best);
374                         break;
375                 }
376         }
377         return __ret;
378 }
PR ME —*T for I, HRMAURUE counter, MM counter CLAN SHE
SEARS) BOSSA swap_out( ISOLS624 EADY 6 2, BER LITE OME) UFR ET AD, tRaGeE
AO, counter HFT (nr_threads<< SWAP_SHIFT), Ell 32% nr_threads, 1X4! nr_threads 2% e
HUPEAD CRE. A ARR EAE TO AA BA, RET She OA IKE. BAL gtp_mask
"EER HAT Mh
TERI, EEE ERE PIES) — NEARY best. $B) T at Hse PUY
TUMOR A. HEE AP ml PUTT ET, RE OE ARBORS, 8 t
RPS PBR EEA.
SES IRATE. GES ACALIRAY “swap_out”, (HSER ESL Ae we mie Sse EEE
HER, AR pe A TI, PLL PTR TAN HR FE MA. MA HE
FRAME “ARATE” (REPRE? WPCA DLA “BURRS” GS RRUAREE” GSES. AE TERA
HOS AME SU, AEP LL RSPR T BRA ETI MR. EEA — META,
BRA HE AS ITE NN NE TT A ABE A A, TESA EE SPR. RE
SAR “SE PY IRS” (resident set), SLAB rss. TEP HEE mm_struct PA PASE
ARGUE SR ha OATH rss Bet TAD RAS. TORR EEN Ti Aa RAS IL
toatik Se
FURS LY Ae for HH RATA IB SLATE ARIST AMEE. ACP PEL task_struct SAAB
SURE —MAB. DEAE initoask JOA BANU — (SEER, LATER ie. EN
JETT, SERB “RMU”. BILL, AA init_task. next_lask J672 init_task 1h. BALAEEBR
ERRATA ERD. RIOD BE MPA mm-oswap ent YAIR. ES mm_struct SiH PAVE
AME, Ce aE ARNG PE INT ALTE Td, IA TTT mm_struce SDT ADS AB AR
TOUR AE T A, Rak aT AE Rk Fe ULE mors. ARIK“ A"
Bi. SARE S/S — ASTI, BASAL mm->swap_ent ik 1, ALARA O. TEL,
mm->rss BET NERY ARV ALTE TURES, TH mm->swap_ent WRT dere: RNY fF HCA
HPPOKLAAROROEE, RECA CPR DLA THAMMMMKENSR, RK
ARE) AS EERE He". AYALA mm->swap_ent MALT 0, Mig HE VE AA" best”
A 439~444 tr), PHO AAE assign Bim 1, Ag 3k- VOR EERE HT 9 mm->rss
7H] mm->swap_ent P, RE BME TSAI (DE, PA OY OS LE
ROE AE” BUG BUR SP ASTD A CTR ALY > AAT TETAT, SAS oe BE) FIR A
SURE" DURA BURR. RR ERY Pat A EAN TH APC he be:
AH A He A TT A&A SR Sw I} Hy TB SAE AE) swap_out( )
(YS RT RDA AE TLIO. A Ae er TERR TREAT A A RTO
PRE“ RAERTR” best DUG. BRAUER A, ETT AE TU 0
if OS HAE et swap_out_mm( )3% TERRE. 29 swap_out_mom )F7/j lie} 4 TERED Rh Mf
ASWELL 0, JAIL RU EAE. ERR LEZ AT SELL 356 47097 atomic_inc( ) i&!# mm_struct FFF O4tE
+109.Nes.
Lay esa
HUE mm_users #56 RUUG Fhe 373 47 mmput( YAREGIR, (XMM Aa TERRE MRIS T
SMP, IMATE NR BRA.
PRC swap_out_mm( )iHI{RE5H4 vmscan.c 4
[kswapd() > do_try_to_free_pages() > refill_inactive() > swap_out() > swap_out_mm()]
257 static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
258 {
259         int result = 0;
260         unsigned long address;
261         struct vm_area_struct* vma;
262
263         /*
264          * Go through process' page directory.
265          */
266
267         /*
268          * Find the proper vm-area after freezing the vma chain
269          * and ptes.
270          */
271         spin_lock(&mm->page_table_lock);
272         address = mm->swap_address;
273         vma = find_vma(mm, address);
274         if (vma) {
275                 if (address < vma->vm_start)
276                         address = vma->vm_start;
277
278                 for (;;) {
279                         result = swap_out_vma(mm, vma, address, gfp_mask);
280                         if (result)
281                                 goto out_unlock;
282                         vma = vma->vm_next;
283                         if (!vma)
284                                 break;
285                         address = vma->vm_start;
286                 }
287         }
288         /* Reset to 0 when we reach the end of address space */
289         mm->swap_address = 0;
290         mm->swap_cnt = 0;
291
292 out_unlock:
293         spin_unlock(&mm->page_table_lock);
294         return result;
295 }
F3ti, mm->swap_address Hav TEA TIMER Y BEROE AS RANT ik. AINA I 0, ABE
“HO.RI Amn e
FIN BOS RT aL a AR O CR 289 47). FERRE for RP HRA TIA IL
ARSUSUOTZENY AEA? At va, AUS RLIBJH swap_out_vma( )istHdBeih -oiti. WIA GBI] 1D, Le
MES RIERT « AUR EPR hl. BRA TUE PAL, #8ik swap_out_vma( ),
swap_out_pgd(). swap_out_pmd(), —Hl try_to_swap_out(), iA Beit di —7 ZA pre PTI
Ae Ki. PAR ILP SRE TE, ROTM RR, RERNARKE
tyto_swap_out(), ARBRE. Pili, RM -L—- PRE EME TH we
[kswapd() > do_try_to_free_pages() > refill_inactive() > swap_out() > swap_out_mm()
 > swap_out_vma() > swap_out_pgd() > swap_out_pmd() > try_to_swap_out()]
27 /*
28  * The swap-out functions return 1 if they successfully
29  * threw something out, and we got a free page. It returns
30  * zero if it couldn't do anything, and any other value
31  * indicates it decreased rss, but the page was shared.
32  *
33  * NOTE! If it sleeps, it *must* return 1 to make sure we
34  * don't continue with the swap-out. Otherwise we may be
35  * using a process that no longer actually exists (it might
36  * have died while we slept).
37  */
38 static int try_to_swap_out(struct mm_struct * mm,
                struct vm_area_struct* vma, unsigned long address,
                pte_t * page_table, int gfp_mask)
39 {
40         pte_t pte;
41         swp_entry_t entry;
42         struct page * page;
43         int onlist;
44
45         pte = *page_table;
46         if (!pte_present(pte))
47                 goto out_failed;
48         page = pte_page(pte);
49         if ((!VALID_PAGE(page)) || PageReserved(page))
50                 goto out_failed;
51
52         if (!mm->swap_cnt)
53                 return 1;
54
55         mm->swap_cnt--;
56
AE, BH page_table Kin LAAI— AMMAN. MAAK, SA page table HAR
Bo UR PAN A AIA ER pte VE, HMR pte_presemt( PRM AAT ITH HEME TH EA
FER, MURA TE TEL ARES A] out_failed, AUKERE RAM Ts
“mM.Linux APA fh
106  out_failed:
107      return 0;
i ry_to_swap_out( ln O Rf, FUERA RBS, Tt Pm de
RAO - TRE. WR TMHREAIR, AAG HB-B bP.
RL, RH TEAL AEH , LDL pte_page yi LIM BETAY A A Me ARENT NT
19 page SHH. HRPM page S8FIMZE mem_map Hip, HFCL page - mem_map)itJEi% HE
NURS CRRA Fin). BARA E SA RAR A TE MES max_mapnr, ASA Ak — 7
WORK, AM RAN ANE MOORE ME), PREM ok.
118  #define VALID_PAGE(page)    ((page - mem_map) < max_mapnr)
JESS, ARATE FET AS AOR hy ET SE
Beal AAR, PAAR LTT. BTELAF mm-sswap_cnt WK 1. SARE RE
try_to_swap_out( )iM{ UH:
[kswapd( ) > do_try_to_free_pages( ) > refill_inactive ( )> swap_out( ) > swap_out_mm( )
> swap_out_vma( ) > swap_out_pgd() > swap_out_pmd( ) > try_to_swap_out( )}
 57      onlist = PageActive(page);
 58      /* Don't look at this pte if it's been accessed recently. */
 59      if (ptep_test_and_clear_young(page_table)) {
 60          age_page_up(page);
 61          goto out_failed;
 62      }
 63      if (!onlist)
 64          /* The page is still mapped, so it can't be freeable... */
 65          age_page_down_ageonly(page);
 66
 67      /*
 68       * If the page is in active use by us, or if the page
 69       * is in active use by others, don't unmap it or
 70       * (worse) start unneeded IO.
 71       */
 72      if (page->age > 0)
 73          goto out_failed;
 74
WTP RUIN page 2A, PAR Mags | MRAP OA AY TARAS. Heeb By PG_active ti
BURR MTAT RM HE", AE TE activelist BSI:
230  #define PageActive(page)    test_bit(PG_active, &(page)->flags)
AT SCR ETM — GEER LRU BARI, ARLE active list GA 3) PBR TRA — HE
inactive_dirty_list ‘1 B@4E 7 inactive_clean_list |', E> “FAR BE HMUMAR,
“Mm.Bl mee
— PR PE a, WORT IA RT EH) PT. RRL inline ef
‘& ptep_test_and_clear_young (iit (3FIH 0) AY. Haz X7E include/asm-i386/pgtable.h "P+
285  static inline int ptep_test_and_clear_young(pte_t *ptep)    { return test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep); }
AUNT, SiH S_PAGE_ACCESSED i243. 44 1386 CPU IPS FBR OLA it aS
HDA HG — Fi FSA SIL HS HH: 5 AT Vd SRS lH We RE TL AN
PAGE_ACCESSED fran 1. AFL, WR pte_young( Bll 1, BEARS Bort] 4 RTE
WAH ty_to_swapou( 4, WRMBDOAww--k, DELP “He”. A, Bae
LATTA MR MATTER AIRES A A, OPA RR. AE TM, AEE
(21h (_PAGE_ACCESSED Peak fvii& 0. AEE S AA, y bE DGA Matin
Pe
MORE “ARE”, ARE EE I DUR S, ATLABBEREH out_failed. Ait, A463)
outfailed 2AELR— AHI: MAR TIMER, MLL SetPageReferenced( y+ page SARAH HU
PG.referenced PeAAL FEL Le HR AAR. HG TCT ZU Hp Re as SE BNL ew fs A He Se
PTAA TSU, NWI RE age_page_upt )SHH A Tif HT OL RAP 9k “CAN ease” ASIST
Dy EARP TM AE UAE ALL HG lo
[kswapd( } > do_try_to_free_pages( ) > refill_inactive ( ) > swap_out( ) > swap_out_mm( ) > swap_out_vma( )
> swap_out_pgd( ) > swap_out_pmd( ) > try_to_swap_out( ) > age_page_up )]
125  void age_page_up(struct page * page)
126  {
127      /*
128       * We're dealing with an inactive page, move the page
129       * to the active list.
130       */
131      if (!page->age)
132          activate_page(page);
133
134      /* The actual page aging bit */
135      page->age += PAGE_AGE_ADV;
136      if (page->age > PAGE_AGE_MAX)
137          page->age = PAGE_AGE_MAX;
138  }
$5) oucfailed LUa, MPMALGEMLO, Uw eR TM. EE, BF He RRR
SABRE TRS, HR) — FMA pte Ht f_PAGE_ACCESSED Pa (208 4) 0, ABA LAAT
“Oe” 7. SEP la), WIA RT EO RR EE
BP) EASA IEA NE? LUA aS He do_swap_page( 4 EH), “Hb Hal FA TTR
SL (SAGE AR TUT BD BRAS IN) FEAR VOL iG ER OU BA B,C A eB
page_launder(), eb Be (i FAA HCBE EVA AEE, ULAR PRI ATA AE ATER To
SUR TURILAAR “ARS, ARR APT NAR, REPL SA
U3.ire
Limax ABEL ath) esi
AG ARAGLCC RMA, DEE “OUR” BILE. LAGE? TAIL page->age (HL,
80 FC he «RS A RH EB SL age_page.down_ageonly( ik 36% fr
Gnmiswapc):
103  /*
104   * We use this (minimal) function in the case where we
105   * know we can't deactivate the page (yet).
106   */
107  void age_page_down_ageonly(struct page * page)
108  {
109      page->age /= 2;
110  }
FUE page->age MAAS 0, BUCA AE TRH, PTL HSL F! out_failed.
BALA, APT MAM LABS T HART. BARE Tet,
[kswapd() > do_try_to_free_pages( ) > refill_inactive ( ) > swap_out( ) > swap_out_mm( ) > swap_out_yma( )
> swap_out_pgd() > swap_out_pmd( )> try_to_swap_out()]
 75      if (TryLockPage(page))
 76          goto out_failed;
 77
 78      /* From this point on, the odds are that we're going to
 79       * nuke this pte, so read and clear the pte. This hook
 80       * is needed on CPUs which update the accessed and dirty
 81       * bits in hardware.
 82       */
 83      pte = ptep_get_and_clear(page_table);
 84      flush_tlb_page(vma, address);
 85
 86      /*
 87       * Is the page already in the swap cache? If so, then
 88       * we can just drop our reference to it without doing
 89       * any IO - it's already up-to-date on disk.
 90       *
 91       * Return 0, as we didn't actually free any real
 92       * memory, and we should just continue our scan.
 93       */
 94      if (PageSwapCache(page)) {
 95          entry.val = page->index;
 96          if (pte_dirty(pte))
 97              set_page_dirty(page);
 98  set_swap_pte:
 99          swap_duplicate(entry);
100          set_pte(page_table, swp_entry_to_pte(entry));
101  drop_pte:
102          UnlockPage(page);
103          mm->rss--;
104          deactivate_page(page);
105          page_cache_release(page);
106  out_failed:
107          return 0;
108      }
FY page SEGRE Ri, AR AR RTT ERE, PLR Ht
‘TryLockPage( )#¥ page SUR HLL (include/linux/mm.h):
183  #define TryLockPage(page)    test_and_set_bit(PG_locked, &(page)->flags)
MRE PEL 1, BFA PG locked MSM GORMOBA 1, BARMERA T, lea akAS
EASULMLIK >} page BURA, MM RARE.
DMAP EL es. BLY LARA TCT A ASTRA a AE aT
HFM T ptep_get_and_clear( )PEK—U HUN MA AE, FH AeA AK O, BALM ATE
OB. GTA 45 TOS T AMARA, HARASS -IK, MMU
OR? EMMA SH, ASMA OTA CE + CPU Lse4T, PILLS ROMA OTE
Bae,
WRAY page AMAA RRA / MTA, WAAEEY swapper_space
AMBP, BARMMAS ABER, REORA AIT, Zea H US
EPRM, AVAT. Bel, AMARA / RMR TMI IR” AHR” ES, FLL
RUM CHL S Hol RBA A set_page_diny( HI A “HE” TOTO. ARF PageSwapCache ( )
{XA Cinclude/linux/mm.h):
217  #define PageSwapCache(page)    test_bit(PG_swap_cache, &(page)->flags)
Biae( PG_swap_cache 7) 1 27% page HMI swapper_space BLIP, UOTE TAL AT
HRA / HUTT. HOI page SHIEH index PREP 32 CADRSLM sup_entry 1, SeoE BAT
KATIA (OBRINT . FAK swap_duplicatel EM, -HALBERNRAIRINATEE BES,
SERA MAAR LTE REM. SAREYAE mm/swapfile.e
[kswapd() > do_try_to_free_pages( ) > refill_ inactive () > swap_out) > swap_out_mn( ) > swap_out_vma( )
> swap_out_pgd{ ) > swap_oul_pmd( ) > try_to_swap_out( )> swap_duplicate()}
820  /*
821   * Verify that a swap entry is valid and increment its swap map count.
822   * Kernel_lock is held, which guarantees existance of swap device.
823   *
824   * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
825   * "permanent", but will be reclaimed by the next swapoff.
826   */
827  int swap_duplicate(swp_entry_t entry)
828  {
829      struct swap_info_struct * p;
830      unsigned long offset, type;
831      int result = 0;
832
833      /* Swap entry 0 is illegal */
834      if (!entry.val)
835          goto out;
836      type = SWP_TYPE(entry);
837      if (type >= nr_swapfiles)
838          goto bad_file;
839      p = type + swap_info;
840      offset = SWP_OFFSET(entry);
841      if (offset >= p->max)
842          goto bad_offset;
843      if (!p->swap_map[offset])
844          goto bad_unused;
845      /*
846       * Entry is valid, so increment the map count.
847       */
848      swap_device_lock(p);
849      if (p->swap_map[offset] < SWAP_MAP_MAX)
850          p->swap_map[offset]++;
851      else {
852          static int overflow = 0;
853          if (overflow++ < 5)
854              printk("VM: swap entry overflow\n");
855          p->swap_map[offset] = SWAP_MAP_MAX;
856      }
857      swap_device_unlock(p);
858      result = 1;
859  out:
860      return result;
861
862  bad_file:
863      printk("Bad swap file entry %08lx\n", entry.val);
864      goto out;
865  bad_offset:
866      printk("Bad swap offset entry %08lx\n", entry.val);
867      goto out;
868  bad_unused:
869      printk("Unused swap offset entry in swap_dup %08lx\n", entry.val);
870      goto out;
871  }
DhaniRRL. BRAY swp_entry_t ER EAE 32 HIG SHEAR, HMR MY AEA EO, tee fe HG A
HEHE O, HAMS COPEL offset WES LATIMES, KRAIT ROLLE type RISES AIR eA
SHES. CURL, HALE type Sef 5 “RAY ER, RIES.
U6.B28 He
VLA PR, BERT EPH EPMO AEA swap_info PARAL IR BLA swap_info._ struct SLAY. BP
AREA MI SR EYMEEAL swap_map J» UCLA BEE A UA OS MAPS a
RRCKERA RE | KURA 0, HMR, BAH, BILL AS
SWAP_MAP MAX. 22a E SURWJU35i| Bete ik THEMES TR
FS try_to_swap_out( )AUEE I. 100 FR] set pte)» FERMI L TUTOR LTTE A EAS
TUAER, RACAL PTE RTA ER TAT ETCH. BCE, ABUT EARS drop_pte HH
ELYaSUEASAVSE Wy IESE A os Bm TST. cs T RATTLE TRE ARATE
AAT EWTRROUMK EE, PALLET deactivate_page( JA AKAMA SBM RII, JF
ETL page SEMA ISER DLAI PUPAL A RA ANSER HMI Cmmswap.c)=
kswapd( ) > do_try_to_free_pages( ) > refill_inactive () > swap_out( ) > swap_out_mm( } > swap_out_vima()
> swap_out_ped( ) > swap_out_pmd( ) > try_to_swap_out( ) > deuctivate_puge ()]
189  void deactivate_page(struct page * page)
190  {
191      spin_lock(&pagemap_lru_lock);
192      deactivate_page_nolock(page);
193      spin_unlock(&pagemap_lru_lock);
194  }
[kswapd( ) > do_try_to_free_pages( ) > refill_inactive ( )>swap_out( ) > swap_out_mm( ) > swap_out_vma( )
>swap_out_pgd( )>swap_out_pmd( ) > try_to_swap_out( ) > deactivate_page ( ) > deactivate_page_nolock( )]
154  /**
155   * (de)activate_page - move pages from/to active and inactive lists
156   * @page: the page we want to move
157   * @nolock - are we already holding the pagemap_lru lock?
158   *
159   * Deactivate_page will move an active page to the right
160   * inactive list, while activate_page will move a page back
161   * from one of the inactive lists to the active list. If
162   * called on a page which is not on any of the lists, the
163   * page is left alone.
164   */
165  void deactivate_page_nolock(struct page * page)
166  {
167      /*
168       * One for the cache, one for the extra reference the
169       * caller has and (maybe) one for the buffers.
170       *
171       * This isn't perfect, but works for just about everything.
172       * Besides, as long as we don't move unfreeable pages to the
173       * inactive_clean list it doesn't need to be perfect...
174       */
175      int maxcount = (page->buffers ? 3 : 2);
176      page->age = 0;
177      ClearPageReferenced(page);
178
179      /*
180       * Don't touch it if it's not on the active list.
181       * (some pages aren't on any list at all)
182       */
183      if (PageActive(page) && page_count(page) <= maxcount &&
              !page_ramdisk(page)) {
184          del_page_from_active_list(page);
185          add_page_to_inactive_dirty_list(page);
186      }
187  }
TEMPE page BEEP AMAR count, SITAR 0, EAA INE 1
CHL__alloc_pages( )BI rmqueuet )f#/845 9, se Je 4 CIAL AM — 4S FEL”, we 3 — NB
LE count AN 1. GA, MRIS BNA 2, RTL INT FRA Ce te ae TC AR
OY. BERG ASAE AE, RO RE EKO SBR ERE 2 HEA ME,
WEE ILI maxcount. (HE, AEE — APPAR, BORA ALAS mmap( BRIA
JSF, MRP AER OT I, RASC PRE a, BK SIN PR / ICE AS
2, DET RRs PRE, HL page MPH ET buffers HI—% buffer_head SARAHB,
FAA WL TAR BITS AS HU. BFLA, “4 page->buffers Jf 0 If, maxcount 9 3 BLAM,
FEROBAT AEA WY 77 SUD OSE — TAS 5 Sh PAP URI AT AT REF F ramdisk, BULA AB Sd ES
SAME, KEORDAMAS RMR, A. ARMAS, RAT SK
SEMA AAT ER AAT ERI TL. SCH a A RRA lt,
MERE AT ARERR AS. TUB, MARSH tb TBE, RE AS AE HBR BA 3 eb 0 TT PF 2k
deactivate_page_nolock( ) jf A #4.
He VARMA RAAT, BETA TAY page Sit MISE IMM LRU BAS active list PF
BI-TAARORPA, OE, RAPA MMAR. ARE “dirty”, RUG RE RE LS
Ah, BRC LR RAS BA AE” TAS, RATS LeROR ATA, I
PARES MATAMEE “We”. Ti HUE “clean”, RIERA S LAT BUY “TH” RL
SS. ROPE GUN be) Hy A RRR, QL TCT PL AA, ld AER
fe PARLE]. ANGER “ME” TIARA —+>, AHL AE imactive_dirty_list: i 7AWARK “+7” TB
WARS, HP WMHSAK PMA inactive cleanlist I). BA, 4 —-P SINGER A
RY, REHAB T API? BMRA E RA “AL” HR IBAT. 3-4 page HEA.
TR BR OFM AE AE LEARY del_page_from_active_list( )7CAKA. J&5E X7F include/linux/swap.h PF:
234  #define del_page_from_active_list(page) { \
235      list_del(&(page)->lru); \
236      ClearPageActive(page); \
237      nr_active_pages--; \
238      DEBUG_ADD_PAGE \
239      ZERO_PAGE_BUG \
240  }
UB.M2 Bie
He page SKTGEA AARON, SMITH add_page_to_inactive_dirty_list( )75 Rk:
217  #define add_page_to_inactive_dirty_list(page) { \
218      DEBUG_ADD_PAGE \
219      ZERO_PAGE_BUG \
220      SetPageInactiveDirty(page); \
221      list_add(&(page)->lru, &inactive_dirty_list); \
222      nr_inactive_dirty_pages++; \
223      page->zone->inactive_dirty_pages++; \
224  }
iK EM ClearPageActivet yA SetPagelnactiveDirty( )71 15 page 19409 PG_active frats(iLit i 0
‘A0% PG_inactive_diny HARA 1, ERATED page SP MHUT AAR.
LEIA) wy_to_swap_out( )AOACESh, BARITIE TAA PDE, SLES 7 ICT
WUHITM. iRALETZIMP page_cache_release( ). S/7_t AE Hi__free_pages( TEMA.
 31  #define page_cache_release(x)    __free_page(x)
379 define __free page (page) __free_pages (