RDS: Use page_remainder_alloc() for recv bufs

Instead of splitting up a page into RDS_FRAG_SIZE chunks
ourselves, ask rds_page_remainder_alloc() to do it. While it
is possible for PAGE_SIZE to be larger than RDS_FRAG_SIZE, on
x86 it isn't, so having duplicate "carve up a page into
buffers" code seems excessive.

The other modification this spawns is the use of a single
struct scatterlist in rds_page_frag instead of a bare page
pointer. This increases verbosity in some places and decreases
it in others.
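
Concretely, the scatterlist carries everything the three old fields
did. Roughly, once the entry has been mapped with ib_dma_map_sg()
(a correspondence sketch, not code from the patch):

	struct page *page = sg_page(&frag->f_sg);	/* was frag->f_page */
	unsigned int off = frag->f_sg.offset;		/* was frag->f_offset */
	dma_addr_t dma = sg_dma_address(&frag->f_sg);	/* was frag->f_mapped */
	unsigned int len = sg_dma_len(&frag->f_sg);	/* was RDS_FRAG_SIZE */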

Finally, I decided to unify the lifetimes and alloc/free of
rds_page_frag and its page. This is a nice simplification in itself,
but will be extra-nice once we come to adding cmason's recycling
patch.
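
Condensed from the refill and free paths in the diff below, the paired
lifetime now looks roughly like this (a sketch with error handling
trimmed, not verbatim code):

	/* allocation: the frag and its buffer come into existence together */
	frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
	if (frag) {
		sg_init_table(&frag->f_sg, 1);
		if (rds_page_remainder_alloc(&frag->f_sg, RDS_FRAG_SIZE,
					     GFP_NOWAIT)) {
			kmem_cache_free(rds_ib_frag_slab, frag);
			frag = NULL;
		}
	}

	/* teardown: one call now frees both the page ref and the frag,
	 * where the old code needed rds_ib_frag_drop_page() followed by
	 * rds_ib_frag_free() */
	if (frag)
		rds_ib_frag_free(frag);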

Signed-off-by: Andy Grover <andy.grover@oracle.com>

Author: Andy Grover <andy.grover@oracle.com>
Date:   2010-05-24 20:12:41 -07:00
Commit: 0b088e003c (parent fc19de38be)

3 changed files with 29 additions and 73 deletions

--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -28,13 +28,9 @@ extern struct list_head rds_ib_devices;
  * try and minimize the amount of memory tied up both the device and
  * socket receive queues.
  */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
 struct rds_page_frag {
 	struct list_head f_item;
-	struct page *f_page;
-	unsigned long f_offset;
-	dma_addr_t f_mapped;
+	struct scatterlist f_sg;
 };
 
 struct rds_ib_incoming {
@@ -107,7 +103,6 @@ struct rds_ib_connection {
 	struct rds_header *i_recv_hdrs;
 	u64 i_recv_hdrs_dma;
 	struct rds_ib_recv_work *i_recvs;
-	struct rds_page_frag i_frag;
 	u64 i_ack_recv;	/* last ACK received */
 
 	/* sending acks */
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,17 +43,11 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
 
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	__free_page(frag->f_page);
-	frag->f_page = NULL;
-}
-
+/* Free frag and attached recv buffer f_sg */
 static void rds_ib_frag_free(struct rds_page_frag *frag)
 {
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page);
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+	__free_page(sg_page(&frag->f_sg));
 	kmem_cache_free(rds_ib_frag_slab, frag);
 }
 
@@ -71,12 +65,8 @@ static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
 {
 	struct rds_page_frag *frag = recv->r_frag;
 
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-	if (frag->f_mapped)
-		ib_dma_unmap_page(ic->i_cm_id->device,
-			       frag->f_mapped,
-			       RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-	frag->f_mapped = 0;
+	rdsdebug("recv %p frag %p page %p\n", recv, frag, sg_page(&frag->f_sg));
+	ib_dma_unmap_sg(ic->i_cm_id->device, &frag->f_sg, 1, DMA_FROM_DEVICE);
 }
 
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
@@ -116,8 +106,6 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 	}
 	if (recv->r_frag) {
 		rds_ib_recv_unmap_page(ic, recv);
-		if (recv->r_frag->f_page)
-			rds_ib_frag_drop_page(recv->r_frag);
 		rds_ib_frag_free(recv->r_frag);
 		recv->r_frag = NULL;
 	}
@@ -129,16 +117,12 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
-	if (ic->i_frag.f_page)
-		rds_ib_frag_drop_page(&ic->i_frag);
 }
 
 static int rds_ib_recv_refill_one(struct rds_connection *conn,
 				  struct rds_ib_recv_work *recv)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	dma_addr_t dma_addr;
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
 
@@ -161,50 +145,27 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
 		if (!recv->r_frag)
 			goto out;
 		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		recv->r_frag->f_page = NULL;
-	}
-
-	if (!ic->i_frag.f_page) {
-		ic->i_frag.f_page = alloc_page(GFP_NOWAIT);
-		if (!ic->i_frag.f_page)
+		sg_init_table(&recv->r_frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
+					       RDS_FRAG_SIZE, GFP_NOWAIT);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
+			recv->r_frag = NULL;
 			goto out;
-		ic->i_frag.f_offset = 0;
+		}
 	}
 
-	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-				  ic->i_frag.f_page,
-				  ic->i_frag.f_offset,
-				  RDS_FRAG_SIZE,
-				  DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-		goto out;
-
-	/*
-	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
-	 * must be called on this recv. This happens as completions hit
-	 * in order or on connection shutdown.
-	 */
-	recv->r_frag->f_page = ic->i_frag.f_page;
-	recv->r_frag->f_offset = ic->i_frag.f_offset;
-	recv->r_frag->f_mapped = dma_addr;
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
 
 	sge = &recv->r_sge[0];
 	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
 	sge->length = sizeof(struct rds_header);
 
 	sge = &recv->r_sge[1];
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
-
-	get_page(recv->r_frag->f_page);
-
-	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-		ic->i_frag.f_offset += RDS_FRAG_SIZE;
-	} else {
-		put_page(ic->i_frag.f_page);
-		ic->i_frag.f_page = NULL;
-		ic->i_frag.f_offset = 0;
-	}
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
 	ret = 0;
 out:
@@ -247,8 +208,8 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 		/* XXX when can this fail? */
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
 		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-			 recv->r_ibinc, recv->r_frag->f_page,
-			 (long) recv->r_frag->f_mapped, ret);
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
 		if (ret) {
 			rds_ib_conn_error(conn, "recv post on "
 			       "%pI4 returned %d, disconnecting and "
@@ -281,7 +242,6 @@ static void rds_ib_inc_purge(struct rds_incoming *inc)
 
 	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
 		list_del_init(&frag->f_item);
-		rds_ib_frag_drop_page(frag);
 		rds_ib_frag_free(frag);
 	}
 }
@@ -333,13 +293,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
 		to_copy = min_t(unsigned long, to_copy, len - copied);
 
 		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
-			 "[%p, %lu] + %lu\n",
+			 "[%p, %u] + %lu\n",
 			 to_copy, iov->iov_base, iov->iov_len, iov_off,
-			 frag->f_page, frag->f_offset, frag_off);
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
 
 		/* XXX needs + offset for multiple recvs per page */
-		ret = rds_page_copy_to_user(frag->f_page,
-					    frag->f_offset + frag_off,
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+					    frag->f_sg.offset + frag_off,
 					    iov->iov_base + iov_off,
 					    to_copy);
 		if (ret) {
@@ -595,7 +555,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
 		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
 
 		src = addr + frag_off;
 		dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -698,12 +658,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		 * the inc is freed.  We don't go that route, so we have to drop the
 		 * page ref ourselves.  We can't just leave the page on the recv
 		 * because that confuses the dma mapping of pages and each recv's use
-		 * of a partial page.  We can leave the frag, though, it will be
-		 * reused.
+		 * of a partial page.
 		 *
 		 * FIXME: Fold this into the code path below.
 		 */
-		rds_ib_frag_drop_page(recv->r_frag);
+		rds_ib_frag_free(recv->r_frag);
+		recv->r_frag = NULL;
 		return;
 	}
 
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -186,6 +186,7 @@ out:
 			 ret ? 0 : scat->length);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
 
 static int rds_page_remainder_cpu_notify(struct notifier_block *self,
 					 unsigned long action, void *hcpu)