/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>

struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)

static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
			       unsigned long ip, bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
 *	'private' fields and also do memory statistics to find all the
 *	[BEEP] leaks.
 *
 */

struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
{
	struct sk_buff *skb;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(skbuff_head_cache,
				    gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->head = NULL;
	skb->truesize = sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);

	skb->mac_header = (typeof(skb->mac_header))~0U;
out:
	return skb;
}

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *		instead of head cache and allocate a cloned (child) skb.
 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *		allocations in case the data is required for writeback
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of at least size bytes. The object has a reference count
 *	of one. The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;
	bool pfmemalloc;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	if (!data)
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(ksize(data));
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* Account for allocated memory : skb + skb->head */
	skb->truesize = SKB_TRUESIZE(size);
	skb->pfmemalloc = pfmemalloc;
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(&fclones->fclone_ref, 1);

		fclones->skb2.fclone = SKB_FCLONE_CLONE;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);
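
/*
 * Example (illustrative sketch, not a real caller): most users reach
 * __alloc_skb() through the alloc_skb() wrapper and then carve the buffer
 * up with skb_reserve() and skb_put().  hlen and dlen are hypothetical
 * sizes chosen by the caller:
 *
 *	struct sk_buff *skb = alloc_skb(hlen + dlen, GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hlen);    (headroom for protocol headers)
 *	skb_put(skb, dlen);        (tail room now holds the payload)
 */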

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc().
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes:
 *  Before IO, driver allocates only data buffer where NIC puts incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	unsigned int size = frag_size ? : ksize(data);

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (!skb)
		return NULL;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->truesize = SKB_TRUESIZE(size);
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	return skb;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);
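
/*
 * Example (illustrative sketch of the RX-ring pattern described above):
 * the driver posts a raw buffer sized for the frame plus the reserved
 * head and tail room, and only wraps it in an sk_buff once DMA completes.
 * buf, buflen and frame_len are hypothetical driver variables:
 *
 *	buflen = SKB_DATA_ALIGN(NET_SKB_PAD + frame_len) +
 *		 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	buf = napi_alloc_frag(buflen);
 *	... hardware writes the frame at buf + NET_SKB_PAD ...
 *	skb = build_skb(buf, buflen);
 *	if (unlikely(!skb)) {
 *		skb_free_frag(buf);
 *		return NULL;
 *	}
 *	skb_reserve(skb, NET_SKB_PAD);
 *	skb_put(skb, frame_len);
 */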

#define NAPI_SKB_CACHE_SIZE	64

struct napi_alloc_cache {
	struct page_frag_cache page;
356
	unsigned int skb_count;
357 358 359
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	unsigned long flags;
	void *data;

	local_irq_save(flags);
	nc = this_cpu_ptr(&netdev_alloc_cache);
	data = page_frag_alloc(nc, fragsz, gfp_mask);
	local_irq_restore(flags);
	return data;
}

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
}
EXPORT_SYMBOL(netdev_alloc_frag);
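
/*
 * Example (illustrative sketch): the fragment returned here is raw memory,
 * not an sk_buff; callers size it for the frame plus the skb_shared_info
 * tail room and typically hand it to build_skb() later (see the example
 * after build_skb() above).  truesize and len are hypothetical:
 *
 *	truesize = SKB_DATA_ALIGN(len) +
 *		   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	buf = netdev_alloc_frag(truesize);
 */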

static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}

void *napi_alloc_frag(unsigned int fragsz)
{
	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
}
EXPORT_SYMBOL(napi_alloc_frag);

/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@len: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has NET_SKB_PAD headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	unsigned long flags;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	local_irq_save(flags);

	nc = this_cpu_ptr(&netdev_alloc_cache);
	data = page_frag_alloc(nc, len, gfp_mask);
	pfmemalloc = nc->pfmemalloc;

	local_irq_restore(flags);

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	/* use OR instead of assignment to avoid clearing of bits in mask */
	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
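
/*
 * Example (illustrative sketch): drivers normally go through the
 * netdev_alloc_skb() wrapper and add only the extra headroom they need on
 * top of the built-in NET_SKB_PAD.  rx_len is a hypothetical frame length
 * and rx_buf a hypothetical receive buffer:
 *
 *	skb = netdev_alloc_skb(dev, rx_len);
 *	if (unlikely(!skb))
 *		return NULL;
 *	memcpy(skb_put(skb, rx_len), rx_buf, rx_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */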

/**
 *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *	@napi: napi instance this buffer was allocated for
474
 *	@len: length to allocate
475 476 477 478 479 480 481 482 483
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
 *	attempt to allocate the head from a special reserved region used
 *	only for NAPI Rx allocation.  By doing this we can save several
 *	CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;
	void *data;

	len += NET_SKB_PAD + NET_IP_ALIGN;

	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	/* use OR instead of assignment to avoid clearing of bits in mask */
	if (nc->page.pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
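
/*
 * Example (illustrative sketch): inside a driver's NAPI poll callback the
 * matching helper is napi_alloc_skb(), and completed buffers are usually
 * passed up via napi_gro_receive().  len and rx_buf are hypothetical:
 *
 *	skb = napi_alloc_skb(napi, len);
 *	if (unlikely(!skb))
 *		break;
 *	memcpy(skb_put(skb, len), rx_buf, len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 */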

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag)
		skb_free_frag(head);
	else
		kfree(head);
}

static void skb_release_data(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		return;

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i]);

	/*
	 * If skb buf is from userspace, we need to notify the caller
	 * the lower device DMA has done;
	 */
	if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
		struct ubuf_info *uarg;

		uarg = shinfo->destructor_arg;
		if (uarg->callback)
			uarg->callback(uarg, true);
	}

	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

	skb_free_head(skb);
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (atomic_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!atomic_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

static void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
#ifdef CONFIG_XFRM
	secpath_put(skb->sp);
#endif
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	nf_bridge_put(skb->nf_bridge);
#endif
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 *	kfree_skb - free an sk_buff
 *	@skb: buffer to free
 *
 *	Drop a reference to the buffer and free it if the usage count has
 *	hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);

void kfree_skb_list(struct sk_buff *segs)
{
	while (segs) {
		struct sk_buff *next = segs->next;

		kfree_skb(segs);
		segs = next;
	}
}
EXPORT_SYMBOL(kfree_skb_list);

/**
 *	skb_tx_error - report an sk_buff xmit error
 *	@skb: buffer that triggered an error
 *
 *	Report xmit error if a device callback is tracking this skb.
 *	skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		struct ubuf_info *uarg;

		uarg = skb_shinfo(skb)->destructor_arg;
		if (uarg->callback)
			uarg->callback(uarg, false);
		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
	}
}
EXPORT_SYMBOL(skb_tx_error);

/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero
 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 *	is being dropped after a failure and records that in its drop tracepoint
 */
void consume_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
755
	trace_consume_skb(skb);
756 757 758 759
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
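
/*
 * Example (illustrative sketch): kfree_skb() and consume_skb() free the
 * buffer the same way; the choice only affects tracing and drop
 * accounting.  A hypothetical transmit path might do:
 *
 *	if (unlikely(tx_error))
 *		kfree_skb(skb);		(counted as a drop)
 *	else
 *		consume_skb(skb);	(normal end of life)
 */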

void __kfree_skb_flush(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	/* flush skb_cache if containing objects */
	if (nc->skb_count) {
		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
				     nc->skb_cache);
		nc->skb_count = 0;
	}
}

static inline void _kfree_skb_defer(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	/* drop skb->head and call any destructors for packet */
	skb_release_all(skb);

	/* record skb to CPU local list */
	nc->skb_cache[nc->skb_count++] = skb;

#ifdef CONFIG_SLUB
	/* SLUB writes into objects when freeing */
	prefetchw(skb);
#endif

	/* flush skb_cache if it is filled */
	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
				     nc->skb_cache);
		nc->skb_count = 0;
	}
}
void __kfree_skb_defer(struct sk_buff *skb)
{
	_kfree_skb_defer(skb);
}

void napi_consume_skb(struct sk_buff *skb, int budget)
{
	if (unlikely(!skb))
		return;

	/* Zero budget indicates a non-NAPI context called us, like netpoll */
	if (unlikely(!budget)) {
		dev_consume_skb_any(skb);
		return;
	}

	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	/* if reaching here SKB is ready to free */
	trace_consume_skb(skb);

	/* if SKB is a clone, don't handle this case */
	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
		__kfree_skb(skb);
		return;
	}

	_kfree_skb_defer(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
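
/*
 * Example (illustrative sketch): a TX-completion loop running from a
 * driver's poll callback passes its NAPI budget through, so freed skbs can
 * take the deferred bulk-free path above; netpoll calls in with a zero
 * budget and falls back to dev_consume_skb_any().  The ring helpers are
 * hypothetical:
 *
 *	while (tx_ring_has_completions(ring)) {
 *		skb = tx_ring_next_completed(ring);
 *		napi_consume_skb(skb, budget);
 *	}
 */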

/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
		     offsetof(struct sk_buff, headers_start));	\
	BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
		     offsetof(struct sk_buff, headers_end));	\

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp		= old->tstamp;
	/* We do not copy old->sk */
	new->dev		= old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
#ifdef CONFIG_XFRM
	new->sp			= secpath_get(old->sp);
#endif
	__nf_copy(new, old, false);

	/* Note : this field could be in headers_start/headers_end section
	 * It is not yet because we do not want to have a 16 bit hole
	 */
	new->queue_mapping = old->queue_mapping;

	memcpy(&new->headers_start, &old->headers_start,
	       offsetof(struct sk_buff, headers_end) -
	       offsetof(struct sk_buff, headers_start));
	CHECK_SKB_FIELD(protocol);
	CHECK_SKB_FIELD(csum);
	CHECK_SKB_FIELD(hash);
	CHECK_SKB_FIELD(priority);
	CHECK_SKB_FIELD(skb_iif);
	CHECK_SKB_FIELD(vlan_proto);
	CHECK_SKB_FIELD(vlan_tci);
	CHECK_SKB_FIELD(transport_header);
	CHECK_SKB_FIELD(network_header);
	CHECK_SKB_FIELD(mac_header);
	CHECK_SKB_FIELD(inner_protocol);
	CHECK_SKB_FIELD(inner_transport_header);
	CHECK_SKB_FIELD(inner_network_header);
	CHECK_SKB_FIELD(inner_mac_header);
	CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
	CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	CHECK_SKB_FIELD(napi_id);
#endif
#ifdef CONFIG_XPS
	CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	atomic_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
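
/*
 * Example (illustrative sketch, after the pattern used by IP fragment
 * reassembly): a reserved sk_buff shell takes over the packet data of a
 * freshly arrived skb, whose own shell is then released:
 *
 *	skb_morph(dst, src);	(dst now shares src's packet data)
 *	consume_skb(src);	(drop the reference held by the old shell)
 */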

/**
 *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
 *	@skb: the skb to modify
 *	@gfp_mask: allocation priority
 *
 *	This must be called on SKBTX_DEV_ZEROCOPY skb.
 *	It will copy all frags into kernel and drop the reference
 *	to userspace pages.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 *
 *	Returns 0 on success or a negative error code on failure
 *	to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int i;
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;

	for (i = 0; i < num_frags; i++) {
		u8 *vaddr;
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

		page = alloc_page(gfp_mask);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)page_private(head);
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		vaddr = kmap_atomic(skb_frag_page(f));
		memcpy(page_address(page),
		       vaddr + f->page_offset, skb_frag_size(f));
		kunmap_atomic(vaddr);
		set_page_private(page, (unsigned long)head);
		head = page;
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < num_frags; i++)
		skb_frag_unref(skb, i);

	uarg->callback(uarg, false);

	/* skb frags point to kernel buffers */
	for (i = num_frags - 1; i >= 0; i--) {
		__skb_fill_page_desc(skb, i, head, 0,
				     skb_shinfo(skb)->frags[i].size);
		head = (struct page *)page_private(head);
	}

	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);
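
/*
 * Example (illustrative sketch): callers rarely invoke this directly; the
 * usual entry point is skb_orphan_frags(), which calls skb_copy_ubufs()
 * only when the skb still points at userspace pages:
 *
 *	if (skb_orphan_frags(skb, GFP_ATOMIC))
 *		goto drop;	(could not privatise the user buffers)
 */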

/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *	copies share the same packet data but not structure. The new
 *	buffer has a reference count of 1. If the allocation fails the
 *	function returns %NULL otherwise the new buffer is returned.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff_fclones *fclones = container_of(skb,
						       struct sk_buff_fclones,
						       skb1);
	struct sk_buff *n;

	if (skb_orphan_frags(skb, gfp_mask))
		return NULL;

	if (skb->fclone == SKB_FCLONE_ORIG &&
	    atomic_read(&fclones->fclone_ref) == 1) {
		n = &fclones->skb2;
		atomic_set(&fclones->fclone_ref, 2);