Linux Kernel File System [3] Code Analysis - page_cache_sync_ra()

Previous posts in this series:

Linux Kernel File System Read [1]
Linux Kernel File System Read [2]
Linux Kernel File System Read [3]
Linux Kernel File System Read [4]


void page_cache_sync_ra(struct readahead_control *ractl,
		unsigned long req_count)
{
	pgoff_t index = readahead_index(ractl);
	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
	struct file_ra_state *ra = ractl->ra;
	unsigned long max_pages, contig_count;
	pgoff_t prev_index, miss;

	/*
	 * Even if readahead is disabled, issue this request as readahead
	 * as we'll need it to satisfy the requested range. The forced
	 * readahead will do the right thing and limit the read to just the
	 * requested range, which we'll set to 1 page for this case.
	 */
	if (!ra->ra_pages || blk_cgroup_congested()) {
		if (!ractl->file)
			return;
		req_count = 1;
		do_forced_ra = true;
	}

	/* be dumb */
	if (do_forced_ra) {
		force_page_cache_ra(ractl, req_count);
		return;
	}

	max_pages = ractl_max_pages(ractl, req_count);
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	/*
	 * A start of file, oversized read, or sequential cache miss:
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	if (!index || req_count > max_pages || index - prev_index <= 1UL) {
		ra->start = index;
		ra->size = get_init_ra_size(req_count, max_pages);
		ra->async_size = ra->size > req_count ? ra->size - req_count :
							ra->size >> 1;
		goto readit;
	}

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	rcu_read_lock();
	miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages);
	rcu_read_unlock();
	contig_count = index - miss - 1;
	/*
	 * Standalone, small random read. Read as is, and do not pollute the
	 * readahead state.
	 */
	if (contig_count <= req_count) {
		do_page_cache_ra(ractl, req_count, 0);
        --> This is the core function.
		return;
	}
	/*
	 * File cached from the beginning:
	 * it is a strong indication of long-run stream (or whole-file-read)
	 */
	if (miss == ULONG_MAX)
		contig_count *= 2;
	ra->start = index;
	ra->size = min(contig_count + req_count, max_pages);
	ra->async_size = 1;
readit:
	ractl->_index = ra->start;
	page_cache_ra_order(ractl, ra, 0);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
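
The initial window in the sequential path above is sized by get_init_ra_size(). A sketch of that heuristic, paraphrased from mm/readahead.c (exact thresholds may differ across kernel versions): for example, a 4-page request with max_pages = 128 rounds up to 4 pages and, being no more than max / 32, is quadrupled to an initial window of 16 pages.

/* Sketch of the initial readahead window heuristic in mm/readahead.c
 * (paraphrased; exact thresholds vary by kernel version). */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;	/* tiny request: grow aggressively */
	else if (newsize <= max / 4)
		newsize = newsize * 2;	/* small request: double it */
	else
		newsize = max;		/* large request: clamp to the maximum */

	return newsize;
}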

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
static void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
    --> This is the core function.
}
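
A concrete example of the clamping above: with isize = 10000 bytes and 4KB pages (PAGE_SHIFT = 12), end_index = (10000 - 1) >> 12 = 2. A request starting at index = 1 with nr_to_read = 8 exceeds end_index - index = 1, so nr_to_read is clamped to end_index - index + 1 = 2, i.e. only pages 1 and 2 are read.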


/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
--> In other words, this is a low-level function; an ordinary Linux kernel developer will rarely need to call or modify it.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct address_space *mapping = ractl->mapping;
	unsigned long ra_folio_index, index = readahead_index(ractl);
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	unsigned long mark, i = 0;
	unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	filemap_invalidate_lock_shared(mapping);
	index = mapping_align_index(mapping, index);

	/*
	 * As iterator `i` is aligned to min_nrpages, round_up the
	 * difference between nr_to_read and lookahead_size to mark the
	 * index that only has lookahead or "async_region" to set the
	 * readahead flag.
	 */
	ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size,
				  min_nrpages);
	mark = ra_folio_index - index;
	nr_to_read += readahead_index(ractl) - index;
	ractl->_index = index;

	/*
	 * Preallocate as many pages as we will need.
	 */
     --> This is where the pages are allocated.
	while (i < nr_to_read) {
		struct folio *folio = xa_load(&mapping->i_pages, index + i);
		int ret;

		if (folio && !xa_is_value(folio)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(ractl);
			ractl->_index += min_nrpages;
			i = ractl->_index + ractl->_nr_pages - index;
			continue;
		}

		folio = filemap_alloc_folio(gfp_mask,
					    mapping_min_folio_order(mapping));
		if (!folio)
			break;

		ret = filemap_add_folio(mapping, folio, index + i, gfp_mask);
		if (ret < 0) {
			folio_put(folio);
			if (ret == -ENOMEM)
				break;
			read_pages(ractl);
			ractl->_index += min_nrpages;
			i = ractl->_index + ractl->_nr_pages - index;
			continue;
		}
		if (i == mark)
			folio_set_readahead(folio);
		ractl->_workingset |= folio_test_workingset(folio);
		ractl->_nr_pages += min_nrpages;
		i += min_nrpages;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the folio is not
	 * uptodate then the caller will launch read_folio again, and
	 * will then handle the error.
	 */
	read_pages(ractl);
    --> This is where the I/O is kicked off.
	filemap_invalidate_unlock_shared(mapping);
	memalloc_nofs_restore(nofs);
}
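
To make the mark computation above concrete: with nr_to_read = 32, lookahead_size = 16, min_nrpages = 1, and an already-aligned index, ra_folio_index = index + 16 and mark = 16, so the folio added at offset 16 in the loop gets the readahead flag. When a reader later touches that marked folio, filemap_readahead() fires and the next batch is fetched asynchronously before the reader drains the cached pages.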

static void read_pages(struct readahead_control *rac)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct folio *folio;
	struct blk_plug plug;

	if (!readahead_count(rac))
		return;

	if (unlikely(rac->_workingset))
		psi_memstall_enter(&rac->_pflags);
	blk_start_plug(&plug);

	if (aops->readahead) {
    --> readahead is one of the address_space_operations; each filesystem implements it.
    	For ext4, .readahead = ext4_readahead. From here on we are in each filesystem's territory.
		aops->readahead(rac);
		/*
		 * Clean up the remaining folios.  The sizes in ->ra
		 * may be used to size the next readahead, so make sure
		 * they accurately reflect what happened.
		 */
		while ((folio = readahead_folio(rac)) != NULL) {
			unsigned long nr = folio_nr_pages(folio);

			folio_get(folio);
			rac->ra->size -= nr;
			if (rac->ra->async_size >= nr) {
				rac->ra->async_size -= nr;
				filemap_remove_folio(folio);
			}
			folio_unlock(folio);
			folio_put(folio);
		}
	} else {
		while ((folio = readahead_folio(rac)) != NULL)
			aops->read_folio(rac->file, folio);
	}

	blk_finish_plug(&plug);
	if (unlikely(rac->_workingset))
		psi_memstall_leave(&rac->_pflags);
	rac->_workingset = false;

	BUG_ON(readahead_count(rac));
}
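
To see this contract from the filesystem side, here is a hypothetical skeleton of an ->readahead implementation. myfs_readahead and myfs_submit_read are made-up names; the submit helper stands in for the filesystem's real I/O path, whose completion handler would mark the folio uptodate and unlock it.

/* Hypothetical ->readahead skeleton; myfs_submit_read() is a made-up
 * helper whose completion handler marks the folio uptodate and unlocks it. */
static void myfs_readahead(struct readahead_control *rac)
{
	struct folio *folio;

	/* readahead_folio() hands back each locked folio in turn; each
	 * folio must be unlocked once the I/O on it has completed. */
	while ((folio = readahead_folio(rac)) != NULL)
		myfs_submit_read(rac->file, folio);
}

Any folios left unconsumed are cleaned up by the loop in read_pages() above, which also shrinks ra->size so the next window is sized from what actually happened.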

static const struct address_space_operations ext4_aops = {
	.read_folio		= ext4_read_folio,
	.readahead		= ext4_readahead,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_write_end,
	.dirty_folio		= ext4_dirty_folio,
	.bmap			= ext4_bmap,
	.invalidate_folio	= ext4_invalidate_folio,
	.release_folio		= ext4_release_folio,
	.migrate_folio		= buffer_migrate_folio,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= ext4_iomap_swap_activate,
};
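
For completeness, ext4's hook itself is tiny. Paraphrased from fs/ext4/inode.c (the ext4_mpage_readpages() signature has changed across kernel versions):

static void ext4_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;

	/* Inline data lives in the inode itself; nothing to read ahead. */
	if (ext4_has_inline_data(inode))
		return;

	ext4_mpage_readpages(inode, rac, NULL);
}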
So far we have traced the flow from the read system call all the way down to the individual filesystem.

The read system call invokes the ext4 filesystem's ext4_file_read_iter(); unless the read is a direct access to storage (O_DIRECT), it goes through generic_file_read_iter(), which first checks whether the page cache holds the data, i.e. whether the storage data is already cached in main memory (DRAM).
If the page cache holds the data, the read is served from the page cache.
If not, page_cache_sync_readahead() reads the data from storage. (When more sequential data is expected to be read next, filemap_readahead() later kicks in to asynchronously fetch additional data from storage.)
page_cache_sync_readahead() ultimately calls the aops->readahead hook, which dispatches to each filesystem's readahead implementation; for ext4, that is ext4_readahead().
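
For reference, page_cache_sync_readahead() itself is just a thin inline wrapper that builds a readahead_control on the stack and calls the page_cache_sync_ra() analyzed above. Paraphrased from include/linux/pagemap.h:

static inline
void page_cache_sync_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *file,
		pgoff_t index, unsigned long req_count)
{
	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
	page_cache_sync_ra(&ractl, req_count);
}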


