msm: kgsl: Optimize page_alloc allocations

User memory needs to be zeroed out before it is sent to the user.
To do this, the kernel maps the page, memsets it to zero and then
unmaps it.  By virtue of mapping it, this forces us to flush the
dcache to ensure cache coherency between kernel and user mappings.
Originally, the page_alloc loop was using GFP_ZERO (which does a
map, memset, and unmap for each individual page) and then we were
additionally calling flush_dcache_page() for each page, which was
killing performance. It is far more efficient, especially for large
allocations (> 1MB), to allocate the pages without GFP_ZERO and
then to vmap the entire allocation, memset it to zero, flush the
cache and then unmap. This process is slightly slower for very
small allocations, but only by a few microseconds, and is well
within the margin of acceptability. In all, the new scheme is
faster than the default for all sizes greater than 16k, and is
almost 4X faster for 2MB and 4MB allocations, which are common for
textures and very large buffer objects.
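
In outline, the fast path looks like the sketch below. This is illustrative
only, not the driver code itself (the real change is in the diff further
down); the helper name zero_pages_fast, the nr_pages parameter and the bare
error handling are assumptions made for the example, and dmac_flush_range()
is the ARM cache maintenance helper the driver already uses.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <asm/cacheflush.h>

/*
 * Illustrative sketch: allocate nr_pages pages without __GFP_ZERO, then
 * zero and flush the whole range in one vmap/memset/flush/vunmap pass.
 * Cleanup of partially allocated pages is omitted for brevity.
 */
static int zero_pages_fast(struct page **pages, int nr_pages)
{
	size_t len = (size_t) nr_pages * PAGE_SIZE;
	void *ptr;
	int i;

	for (i = 0; i < nr_pages; i++) {
		/* No __GFP_ZERO - zeroing happens in one pass below */
		pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
		if (pages[i] == NULL)
			return -ENOMEM;
	}

	/* Map the entire allocation once instead of touching each page */
	ptr = vmap(pages, nr_pages, VM_IOREMAP,
		pgprot_writecombine(PAGE_KERNEL));
	if (ptr == NULL)
		return -ENOMEM;	/* no vmalloc space: see the fallback below */

	memset(ptr, 0, len);
	dmac_flush_range(ptr, ptr + len);
	vunmap(ptr);

	return 0;
}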

The downside is that if there isn't enough vmalloc room for the
allocation, we are forced to fall back to a slow page-by-page
memset/flush. This should happen rarely (if at all); the fallback
is included only for completeness.
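
That slow fallback amounts to the per-page loop sketched below (again only
an illustration mirroring the description above; zero_pages_slow is a
hypothetical name). kmap_atomic() comes from <linux/highmem.h>, which is
presumably why the patch adds that include.

#include <linux/highmem.h>
#include <linux/string.h>
#include <asm/cacheflush.h>

/*
 * Illustrative fallback sketch: if vmap() fails for lack of address space,
 * zero and flush each page through a temporary atomic mapping instead.
 * Much slower, but it only needs one page of kernel address space at a time.
 */
static void zero_pages_slow(struct page **pages, int nr_pages)
{
	int i;

	for (i = 0; i < nr_pages; i++) {
		void *ptr = kmap_atomic(pages[i]);

		memset(ptr, 0, PAGE_SIZE);
		dmac_flush_range(ptr, ptr + PAGE_SIZE);
		kunmap_atomic(ptr);
	}
}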

CRs-Fixed: 372638
Change-Id: Ic0dedbadf3e27dcddf0f068594a40c00d64b495e
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
@@ -17,6 +17,7 @@
 #include <asm/cacheflush.h>
 #include <linux/slab.h>
 #include <linux/kmemleak.h>
+#include <linux/highmem.h>
 #include "kgsl.h"
 #include "kgsl_sharedmem.h"
@@ -489,9 +490,11 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 			struct kgsl_pagetable *pagetable,
 			size_t size, unsigned int protflags)
 {
-	int order, ret = 0;
+	int i, order, ret = 0;
 	int sglen = PAGE_ALIGN(size) / PAGE_SIZE;
-	int i;
+	struct page **pages = NULL;
+	pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL);
+	void *ptr;
 	/*
 	 * Add guard page to the end of the allocation when the
@@ -515,26 +518,53 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 		goto done;
 	}
+	/*
+	 * Allocate space to store the list of pages to send to vmap.
+	 * This is an array of pointers so we can track 1024 pages per page of
+	 * allocation, which means we can handle up to an 8MB buffer request
+	 * with two pages; well within the acceptable limits for using kmalloc.
+	 */
+	pages = kmalloc(sglen * sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL) {
+		KGSL_CORE_ERR("kmalloc (%d) failed\n",
+			sglen * sizeof(struct page *));
+		ret = -ENOMEM;
+		goto done;
+	}
 	kmemleak_not_leak(memdesc->sg);
 	memdesc->sglen = sglen;
 	sg_init_table(memdesc->sg, sglen);
 	for (i = 0; i < PAGE_ALIGN(size) / PAGE_SIZE; i++) {
-		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO |
-			__GFP_HIGHMEM);
-		if (!page) {
+		/*
+		 * Don't use GFP_ZERO here because it is faster to memset the
+		 * range ourselves (see below)
+		 */
+		pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (pages[i] == NULL) {
 			ret = -ENOMEM;
 			memdesc->sglen = i;
 			goto done;
 		}
-		flush_dcache_page(page);
-		sg_set_page(&memdesc->sg[i], page, PAGE_SIZE, 0);
+		sg_set_page(&memdesc->sg[i], pages[i], PAGE_SIZE, 0);
 	}
 	/* Add the guard page to the end of the sglist */
 	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU) {
 		/*
 		 * It doesn't matter if we use GFP_ZERO here, this never
 		 * gets mapped, and we only allocate it once in the life
 		 * of the system
 		 */
 		if (kgsl_guard_page == NULL)
			kgsl_guard_page = alloc_page(GFP_KERNEL | __GFP_ZERO |
				__GFP_HIGHMEM);
@@ -547,6 +577,44 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 		memdesc->sglen--;
 	}
+	/*
+	 * All memory that goes to the user has to be zeroed out before it gets
+	 * exposed to userspace. This means that the memory has to be mapped in
+	 * the kernel, zeroed (memset) and then unmapped. This also means that
+	 * the dcache has to be flushed to ensure coherency between the kernel
+	 * and user pages. We used to pass __GFP_ZERO to alloc_page, which
+	 * mapped, zeroed and unmapped each individual page, and then we had to
+	 * turn around and call flush_dcache_page() on that page to clear the
+	 * caches. This was killing performance. Instead, we found it is much
+	 * faster to allocate the pages without GFP_ZERO, map the entire range,
+	 * memset it, flush the range and then unmap - this results in a factor
+	 * of 4 improvement in speed for large buffers. There is a small
+	 * decrease in speed for small buffers, but only on the order of a few
+	 * microseconds at best. The only downside is that there needs to be
+	 * enough temporary space in vmalloc to accommodate the map. This
+	 * shouldn't be a problem, but if it happens, fall back to a much slower
+	 * path
+	 */
+	ptr = vmap(pages, i, VM_IOREMAP, page_prot);
+	if (ptr != NULL) {
+		memset(ptr, 0, memdesc->size);
+		dmac_flush_range(ptr, ptr + memdesc->size);
+		vunmap(ptr);
+	} else {
+		int j;
+		/* Very, very, very slow path */
+		for (j = 0; j < i; j++) {
+			ptr = kmap_atomic(pages[j]);
+			memset(ptr, 0, PAGE_SIZE);
+			dmac_flush_range(ptr, ptr + PAGE_SIZE);
+			kunmap_atomic(ptr);
+		}
+	}
 	outer_cache_range_op_sg(memdesc->sg, memdesc->sglen,
 			KGSL_CACHE_OP_FLUSH);
@@ -564,6 +632,8 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 	kgsl_driver.stats.histogram[order]++;
 done:
+	kfree(pages);
 	if (ret)
 		kgsl_sharedmem_free(memdesc);