리눅스 커널 파일 시스템 [3] 코드 분석 - page_cache_sync_ra()
/*
 * Synchronous readahead entry point, called on a page-cache miss.
 * Decides how much to read around the missed index (forced read for
 * random access, fresh window for a new stream, or a window grown from
 * the cached history) and submits the I/O.
 * @ractl: readahead control (file, mapping, starting index, ra state).
 * @req_count: number of pages the caller actually asked for.
 */
void page_cache_sync_ra(struct readahead_control *ractl,
unsigned long req_count)
{
pgoff_t index = readahead_index(ractl);
/* O_RANDOM-style hint (posix_fadvise POSIX_FADV_RANDOM) disables heuristics */
bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
struct file_ra_state *ra = ractl->ra;
unsigned long max_pages, contig_count;
pgoff_t prev_index, miss;
/*
 * Even if readahead is disabled, issue this request as readahead
 * as we'll need it to satisfy the requested range. The forced
 * readahead will do the right thing and limit the read to just the
 * requested range, which we'll set to 1 page for this case.
 */
if (!ra->ra_pages || blk_cgroup_congested()) {
if (!ractl->file)
return;
req_count = 1;
do_forced_ra = true;
}
/* be dumb */
if (do_forced_ra) {
/* read exactly the requested range, keep no readahead state */
force_page_cache_ra(ractl, req_count);
return;
}
max_pages = ractl_max_pages(ractl, req_count);
prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
/*
 * A start of file, oversized read, or sequential cache miss:
 * trivial case: (index - prev_index) == 1
 * unaligned reads: (index - prev_index) == 0
 */
if (!index || req_count > max_pages || index - prev_index <= 1UL) {
/* start a fresh readahead window at the current index */
ra->start = index;
ra->size = get_init_ra_size(req_count, max_pages);
ra->async_size = ra->size > req_count ? ra->size - req_count :
ra->size >> 1;
goto readit;
}
/*
 * Query the page cache and look for the traces(cached history pages)
 * that a sequential stream would leave behind.
 */
rcu_read_lock();
miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages);
rcu_read_unlock();
/* pages cached contiguously just before `index` */
contig_count = index - miss - 1;
/*
 * Standalone, small random read. Read as is, and do not pollute the
 * readahead state.
 */
if (contig_count <= req_count) {
do_page_cache_ra(ractl, req_count, 0);
/* --> this is the core function (see its definition below). */
return;
}
/*
 * File cached from the beginning:
 * it is a strong indication of long-run stream (or whole-file-read)
 */
if (miss == ULONG_MAX)
contig_count *= 2;
ra->start = index;
ra->size = min(contig_count + req_count, max_pages);
ra->async_size = 1;
readit:
ractl->_index = ra->start;
page_cache_ra_order(ractl, ra, 0);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
/*
 * do_page_cache_ra() actually reads a chunk of disk. It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * @ractl: readahead control with the starting index set.
 * @nr_to_read: number of pages to read (clamped to EOF below).
 * @lookahead_size: offset at which to place the PG_readahead marker.
 */
static void do_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct inode *inode = ractl->mapping->host;
unsigned long index = readahead_index(ractl);
loff_t isize = i_size_read(inode);
pgoff_t end_index; /* The last page we want to read */
/* empty file: nothing to read */
if (isize == 0)
return;
end_index = (isize - 1) >> PAGE_SHIFT;
/* start index already past EOF: nothing to read */
if (index > end_index)
return;
/* Don't read past the page containing the last byte of the file */
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
/* --> this is the core function (see its definition below). */
}
/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size. This is almost certainly
 * not the function you want to call. Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 * --> i.e. this is a low-level function; an ordinary Linux kernel
 *     developer will rarely need to call or modify it.
 *
 * Context: File is referenced by caller. Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
unsigned long ra_folio_index, index = readahead_index(ractl);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
unsigned long mark, i = 0;
/* smallest folio the mapping allows, in units of base pages */
unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);
/*
 * Partway through the readahead operation, we will have added
 * locked pages to the page cache, but will not yet have submitted
 * them for I/O. Adding another page may need to allocate memory,
 * which can trigger memory reclaim. Telling the VM we're in
 * the middle of a filesystem operation will cause it to not
 * touch file-backed pages, preventing a deadlock. Most (all?)
 * filesystems already specify __GFP_NOFS in their mapping's
 * gfp_mask, but let's be explicit here.
 */
unsigned int nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
/* align start down to the mapping's minimum folio boundary */
index = mapping_align_index(mapping, index);
/*
 * As iterator `i` is aligned to min_nrpages, round_up the
 * difference between nr_to_read and lookahead_size to mark the
 * index that only has lookahead or "async_region" to set the
 * readahead flag.
 */
ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size,
min_nrpages);
mark = ra_folio_index - index;
/* account for the pages gained by aligning the start down */
nr_to_read += readahead_index(ractl) - index;
ractl->_index = index;
/*
 * Preallocate as many pages as we will need.
 */
/* --> here the pages (folios) are allocated and added to the cache */
while (i < nr_to_read) {
struct folio *folio = xa_load(&mapping->i_pages, index + i);
int ret;
if (folio && !xa_is_value(folio)) {
/*
 * Page already present? Kick off the current batch
 * of contiguous pages before continuing with the
 * next batch. This page may be the one we would
 * have intended to mark as Readahead, but we don't
 * have a stable reference to this page, and it's
 * not worth getting one just for that.
 */
read_pages(ractl);
ractl->_index += min_nrpages;
i = ractl->_index + ractl->_nr_pages - index;
continue;
}
folio = filemap_alloc_folio(gfp_mask,
mapping_min_folio_order(mapping));
if (!folio)
break;
ret = filemap_add_folio(mapping, folio, index + i, gfp_mask);
if (ret < 0) {
folio_put(folio);
if (ret == -ENOMEM)
break;
/* lost a race inserting this index: flush the batch and skip it */
read_pages(ractl);
ractl->_index += min_nrpages;
i = ractl->_index + ractl->_nr_pages - index;
continue;
}
/* PG_readahead on this folio triggers the next async readahead */
if (i == mark)
folio_set_readahead(folio);
ractl->_workingset |= folio_test_workingset(folio);
ractl->_nr_pages += min_nrpages;
i += min_nrpages;
}
/*
 * Now start the IO. We ignore I/O errors - if the folio is not
 * uptodate then the caller will launch read_folio again, and
 * will then handle the error.
 */
read_pages(ractl);
/* --> the actual I/O submission starts here */
filemap_invalidate_unlock_shared(mapping);
memalloc_nofs_restore(nofs);
}
/*
 * Submit the folios accumulated in @rac to the filesystem for reading,
 * preferring the batched ->readahead() op and falling back to per-folio
 * ->read_folio(). Resets the control to an empty state on return.
 */
static void read_pages(struct readahead_control *rac)
{
const struct address_space_operations *aops = rac->mapping->a_ops;
struct folio *folio;
struct blk_plug plug;
/* nothing batched: no-op */
if (!readahead_count(rac))
return;
/* account pressure-stall time if any folio was in the workingset */
if (unlikely(rac->_workingset))
psi_memstall_enter(&rac->_pflags);
/* plug the block layer so the requests below can be merged */
blk_start_plug(&plug);
if (aops->readahead) {
/*
 * --> readahead is one of the address_space_operations; each
 * filesystem implements it. For ext4, .readahead = ext4_readahead.
 * From here on we are in filesystem territory.
 */
aops->readahead(rac);
/*
 * Clean up the remaining folios. The sizes in ->ra
 * may be used to size the next readahead, so make sure
 * they accurately reflect what happened.
 */
while ((folio = readahead_folio(rac)) != NULL) {
unsigned long nr = folio_nr_pages(folio);
folio_get(folio);
rac->ra->size -= nr;
if (rac->ra->async_size >= nr) {
rac->ra->async_size -= nr;
filemap_remove_folio(folio);
}
folio_unlock(folio);
folio_put(folio);
}
} else {
/* legacy path: read each folio individually */
while ((folio = readahead_folio(rac)) != NULL)
aops->read_folio(rac->file, folio);
}
blk_finish_plug(&plug);
if (unlikely(rac->_workingset))
psi_memstall_leave(&rac->_pflags);
rac->_workingset = false;
/* readahead_folio() must have consumed every batched folio */
BUG_ON(readahead_count(rac));
}
/*
 * ext4's address_space_operations table: the VFS/page-cache entry points
 * that ext4 implements. The generic readahead code above reaches ext4
 * through .readahead (ext4_readahead) and .read_folio (ext4_read_folio).
 */
static const struct address_space_operations ext4_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
.dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
.release_folio = ext4_release_folio,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
여기까지 read system call 부터 각 파일시스템에 도달하기까지의 흐름을 살펴봤다. read system call은 ext4 파일시스템의 ext4_file_read_iter()를 부르며, storage에 direct access 하는 것이 아닌 경우 generic_file_read_iter()를 타서 page cache가 존재하는지 (storage의 데이터가 메인메모리(DRAM)에 캐싱되어 있는지) 먼저 살펴본다.
page cache가 있다면 page cache로부터 읽는다.
page cache가 없다면 page_cache_sync_readahead()를 이용하여 storage에서 읽어온다. (앞으로 이후 순차적인 데이터를 storage data에서 더 읽어올 것으로 예상되면, filemap_readahead()가 동작해서 async하게 storage에서 데이터를 더 읽어온다.)
page_cache_sync_readahead()는 최종적으로 aops->readahead 함수를 부르게 되고, 이는 각 파일시스템의 readahead 함수를 부르게 한다. ext4 파일시스템의 경우 ext4_readahead가 불리게 된다.
댓글
댓글 쓰기