1/*-
2 * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 *  1. Redistributions of source code must retain the above copyright notice,
9 *     this list of conditions and the following disclaimer.
10 *
11 *  2. Neither the name of Matthew Macy nor the names of its
12 *     contributors may be used to endorse or promote products derived from
13 *     this software without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 * POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29#include <stdlib.h>
30__FBSDID("$FreeBSD$");
31
32#ifndef __HAIKU__
33#include "opt_inet.h"
34#include "opt_inet6.h"
35#include "opt_acpi.h"
36#include "opt_sched.h"
37#endif
38
39#include <sys/param.h>
40#include <sys/types.h>
41#include <sys/bus.h>
42#include <sys/eventhandler.h>
43#ifndef __HAIKU__
44#include <sys/jail.h>
45#endif
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/mutex.h>
49#include <sys/sx.h>
50#include <sys/module.h>
51#include <sys/kobj.h>
52#include <sys/rman.h>
53#include <sys/proc.h>
54#include <sys/sbuf.h>
55#include <sys/smp.h>
56#include <sys/socket.h>
57#include <sys/sockio.h>
58#include <sys/sysctl.h>
59#include <sys/syslog.h>
60#include <sys/taskqueue.h>
61#include <sys/limits.h>
62
63#include <net/if.h>
64#include <net/if_var.h>
65#include <net/if_types.h>
66#include <net/if_media.h>
67#include <net/bpf.h>
68#include <net/ethernet.h>
69#include <net/if_vlan_var.h>
70#include <net/mp_ring.h>
71#include <net/vnet.h>
72
73#include <netinet/in.h>
74#ifndef __HAIKU__
75#include <netinet/in_pcb.h>
76#include <netinet/tcp_lro.h>
77#include <netinet/in_systm.h>
78#endif
79#include <netinet/if_ether.h>
80#include <netinet/ip.h>
81#include <netinet/ip6.h>
82#include <netinet/tcp.h>
83#include <netinet/ip_var.h>
84#include <netinet/netdump/netdump.h>
85#ifndef __HAIKU__
86#include <netinet6/ip6_var.h>
87#endif
88
89#include <machine/bus.h>
90#ifndef __HAIKU__
91#include <machine/in_cksum.h>
92#endif
93
94#include <vm/vm.h>
95#include <vm/pmap.h>
96
97#include <dev/led/led.h>
98#include <dev/pci/pcireg.h>
99#include <dev/pci/pcivar.h>
100#ifndef __HAIKU__
101#include <dev/pci/pci_private.h>
102#endif
103
104#include <net/iflib.h>
105#include <net/iflib_private.h>
106
107#include <ifdi_if.h>
108#include <device_if.h>
109
110#ifdef PCI_IOV
111#include <dev/pci/pci_iov.h>
112#endif
113
114#include <sys/bitstring.h>
115/*
116 * enable accounting of every mbuf as it comes in to and goes out of
117 * iflib's software descriptor references
118 */
119#define MEMORY_LOGGING 0
120/*
121 * Enable mbuf vectors for compressing long mbuf chains
122 */
123
124/*
125 * NB:
126 * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
127 *   we prefetch needs to be determined by the time spent in m_free vis a vis
128 *   the cost of a prefetch. This will of course vary based on the workload:
129 *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
130 *        is quite expensive, thus suggesting very little prefetch.
131 *      - small packet forwarding which is just returning a single mbuf to
132 *        UMA will typically be very fast vis a vis the cost of a memory
133 *        access.
134 */
135
136
137/*
138 * File organization:
139 *  - private structures
140 *  - iflib private utility functions
141 *  - ifnet functions
142 *  - vlan registry and other exported functions
143 *  - iflib public core functions
144 *
145 *
146 */
147MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
148
149struct iflib_txq;
150typedef struct iflib_txq *iflib_txq_t;
151struct iflib_rxq;
152typedef struct iflib_rxq *iflib_rxq_t;
153struct iflib_fl;
154typedef struct iflib_fl *iflib_fl_t;
155
156struct iflib_ctx;
157
158static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
159static void iflib_timer(void *arg);
160
161typedef struct iflib_filter_info {
162	driver_filter_t *ifi_filter;
163	void *ifi_filter_arg;
164	struct grouptask *ifi_task;
165	void *ifi_ctx;
166} *iflib_filter_info_t;
167
168struct iflib_ctx {
169	KOBJ_FIELDS;
170	/*
171	 * Pointer to hardware driver's softc
172	 */
173	void *ifc_softc;
174	device_t ifc_dev;
175	if_t ifc_ifp;
176
177#ifndef __HAIKU__
178	cpuset_t ifc_cpus;
179#endif
180	if_shared_ctx_t ifc_sctx;
181	struct if_softc_ctx ifc_softc_ctx;
182
183	struct sx ifc_ctx_sx;
184	struct mtx ifc_state_mtx;
185
186	iflib_txq_t ifc_txqs;
187	iflib_rxq_t ifc_rxqs;
188	uint32_t ifc_if_flags;
189	uint32_t ifc_flags;
190	uint32_t ifc_max_fl_buf_size;
191	uint32_t ifc_rx_mbuf_sz;
192
193	int ifc_link_state;
194	int ifc_watchdog_events;
195	struct cdev *ifc_led_dev;
196	struct resource *ifc_msix_mem;
197
198	struct if_irq ifc_legacy_irq;
199	struct grouptask ifc_admin_task;
200	struct grouptask ifc_vflr_task;
201	struct iflib_filter_info ifc_filter_info;
202	struct ifmedia	ifc_media;
203	struct ifmedia	*ifc_mediap;
204
205	struct sysctl_oid *ifc_sysctl_node;
206	uint16_t ifc_sysctl_ntxqs;
207	uint16_t ifc_sysctl_nrxqs;
208	uint16_t ifc_sysctl_qs_eq_override;
209	uint16_t ifc_sysctl_rx_budget;
210	uint16_t ifc_sysctl_tx_abdicate;
211	uint16_t ifc_sysctl_core_offset;
212#define	CORE_OFFSET_UNSPECIFIED	0xffff
213	uint8_t  ifc_sysctl_separate_txrx;
214
215	qidx_t ifc_sysctl_ntxds[8];
216	qidx_t ifc_sysctl_nrxds[8];
217	struct if_txrx ifc_txrx;
218#define isc_txd_encap  ifc_txrx.ift_txd_encap
219#define isc_txd_flush  ifc_txrx.ift_txd_flush
220#define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
221#define isc_rxd_available ifc_txrx.ift_rxd_available
222#define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
223#define isc_rxd_refill ifc_txrx.ift_rxd_refill
224#define isc_rxd_flush ifc_txrx.ift_rxd_flush
227#define isc_legacy_intr ifc_txrx.ift_legacy_intr
228	eventhandler_tag ifc_vlan_attach_event;
229	eventhandler_tag ifc_vlan_detach_event;
230	struct ether_addr ifc_mac;
231};
232
233void *
234iflib_get_softc(if_ctx_t ctx)
235{
236
237	return (ctx->ifc_softc);
238}
239
240device_t
241iflib_get_dev(if_ctx_t ctx)
242{
243
244	return (ctx->ifc_dev);
245}
246
247if_t
248iflib_get_ifp(if_ctx_t ctx)
249{
250
251	return (ctx->ifc_ifp);
252}
253
254struct ifmedia *
255iflib_get_media(if_ctx_t ctx)
256{
257
258	return (ctx->ifc_mediap);
259}
260
261uint32_t
262iflib_get_flags(if_ctx_t ctx)
263{
264	return (ctx->ifc_flags);
265}
266
267void
268iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
269{
270
271	bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN);
272}
273
274if_softc_ctx_t
275iflib_get_softc_ctx(if_ctx_t ctx)
276{
277
278	return (&ctx->ifc_softc_ctx);
279}
280
281if_shared_ctx_t
282iflib_get_sctx(if_ctx_t ctx)
283{
284
285	return (ctx->ifc_sctx);
286}
287
288#define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
289#define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
#define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & ~((uintptr_t)CACHE_LINE_SIZE-1)))
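
/*
 * Worked example for the macros above, assuming CACHE_LINE_SIZE is 64:
 * IP_ALIGNED() is true when m_data sits at the usual ETHER_ALIGN (2 byte)
 * offset, so the IP header that follows the 14-byte Ethernet header ends up
 * 4-byte aligned.  CACHE_PTR_NEXT() rounds a pointer up to the next cache
 * line, e.g. 0x1008 + 63 == 0x1047, which masks down to 0x1040.
 */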
291
292#define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
293#define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
294
295typedef struct iflib_sw_rx_desc_array {
296	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
297	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
298	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
299	bus_addr_t	*ifsd_ba;          /* bus addr of cluster for rx */
300} iflib_rxsd_array_t;
301
302typedef struct iflib_sw_tx_desc_array {
303	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
304	bus_dmamap_t	*ifsd_tso_map;     /* bus_dma maps for TSO packet */
305	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
306} if_txsd_vec_t;
307
308/* magic number that should be high enough for any hardware */
309#define IFLIB_MAX_TX_SEGS		128
310#define IFLIB_RX_COPY_THRESH		128
311#define IFLIB_MAX_RX_REFRESH		32
312/* The minimum descriptors per second before we start coalescing */
313#define IFLIB_MIN_DESC_SEC		16384
314#define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
315#define IFLIB_QUEUE_IDLE		0
316#define IFLIB_QUEUE_HUNG		1
317#define IFLIB_QUEUE_WORKING		2
318/* maximum number of txqs that can share an rx interrupt */
319#define IFLIB_MAX_TX_SHARED_INTR	4
320
321/* this should really scale with ring size - this is a fairly arbitrary value */
322#define TX_BATCH_SIZE			32
323
324#define IFLIB_RESTART_BUDGET		8
325
326#define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
327				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
328				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
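
/*
 * CSUM_OFFLOAD collects every transmit checksum and TSO flag iflib knows
 * about (IPv4/IPv6 TCP, UDP and SCTP checksums plus TSO); testing
 * m_pkthdr.csum_flags against this mask is how the transmit path tells
 * packets that need offload setup apart from those that can go out as-is.
 */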
329
330struct iflib_txq {
331	qidx_t		ift_in_use;
332	qidx_t		ift_cidx;
333	qidx_t		ift_cidx_processed;
334	qidx_t		ift_pidx;
335	uint8_t		ift_gen;
336	uint8_t		ift_br_offset;
337	uint16_t	ift_npending;
338	uint16_t	ift_db_pending;
339	uint16_t	ift_rs_pending;
340	/* implicit pad */
341	uint8_t		ift_txd_size[8];
342	uint64_t	ift_processed;
343	uint64_t	ift_cleaned;
344	uint64_t	ift_cleaned_prev;
345#if MEMORY_LOGGING
346	uint64_t	ift_enqueued;
347	uint64_t	ift_dequeued;
348#endif
349	uint64_t	ift_no_tx_dma_setup;
350	uint64_t	ift_no_desc_avail;
351	uint64_t	ift_mbuf_defrag_failed;
352	uint64_t	ift_mbuf_defrag;
353	uint64_t	ift_map_failed;
354	uint64_t	ift_txd_encap_efbig;
355	uint64_t	ift_pullups;
356	uint64_t	ift_last_timer_tick;
357
358	struct mtx	ift_mtx;
359	struct mtx	ift_db_mtx;
360
361	/* constant values */
362	if_ctx_t	ift_ctx;
363	struct ifmp_ring        *ift_br;
364	struct grouptask	ift_task;
365	qidx_t		ift_size;
366	uint16_t	ift_id;
367	struct callout	ift_timer;
368
369	if_txsd_vec_t	ift_sds;
370	uint8_t		ift_qstatus;
371	uint8_t		ift_closed;
372	uint8_t		ift_update_freq;
373	struct iflib_filter_info ift_filter_info;
374	bus_dma_tag_t	ift_buf_tag;
375	bus_dma_tag_t	ift_tso_buf_tag;
376	iflib_dma_info_t	ift_ifdi;
377#define MTX_NAME_LEN 16
378	char                    ift_mtx_name[MTX_NAME_LEN];
379	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
380#ifdef IFLIB_DIAGNOSTICS
381	uint64_t ift_cpu_exec_count[256];
382#endif
383} __aligned(CACHE_LINE_SIZE);
384
385struct iflib_fl {
386	qidx_t		ifl_cidx;
387	qidx_t		ifl_pidx;
388	qidx_t		ifl_credits;
389	uint8_t		ifl_gen;
390	uint8_t		ifl_rxd_size;
391#if MEMORY_LOGGING
392	uint64_t	ifl_m_enqueued;
393	uint64_t	ifl_m_dequeued;
394	uint64_t	ifl_cl_enqueued;
395	uint64_t	ifl_cl_dequeued;
396#endif
397	/* implicit pad */
398	bitstr_t 	*ifl_rx_bitmap;
399	qidx_t		ifl_fragidx;
400	/* constant */
401	qidx_t		ifl_size;
402	uint16_t	ifl_buf_size;
403	uint16_t	ifl_cltype;
404#ifndef __HAIKU__
405	uma_zone_t	ifl_zone;
406#endif
407	iflib_rxsd_array_t	ifl_sds;
408	iflib_rxq_t	ifl_rxq;
409	uint8_t		ifl_id;
410	bus_dma_tag_t	ifl_buf_tag;
411	iflib_dma_info_t	ifl_ifdi;
412	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
413	caddr_t		ifl_vm_addrs[IFLIB_MAX_RX_REFRESH];
414	qidx_t	ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
415}  __aligned(CACHE_LINE_SIZE);
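
/*
 * Note on the free list above: ifl_credits counts buffers currently posted
 * to the hardware, while the trailing ifl_bus_addrs / ifl_vm_addrs /
 * ifl_rxd_idxs arrays stage up to IFLIB_MAX_RX_REFRESH buffers so refills
 * can be handed to the driver's rxd_refill callback in batches instead of
 * one descriptor at a time.
 */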
416
417static inline qidx_t
418get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
419{
420	qidx_t used;
421
422	if (pidx > cidx)
423		used = pidx - cidx;
424	else if (pidx < cidx)
425		used = size - cidx + pidx;
426	else if (gen == 0 && pidx == cidx)
427		used = 0;
428	else if (gen == 1 && pidx == cidx)
429		used = size;
430	else
431		panic("bad state");
432
433	return (used);
434}
435
436#define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
437
438#define IDXDIFF(head, tail, wrap) \
439	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
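
/*
 * Worked example of the ring accounting above, for a 1024-entry txq: with
 * cidx = 1000 and pidx = 8 the producer has wrapped, so get_inuse() returns
 * 1024 - 1000 + 8 = 32 and TXQ_AVAIL() is 992.  When pidx == cidx the
 * generation bit disambiguates empty (gen == 0, nothing in use) from full
 * (gen == 1, all 1024 in use).  IDXDIFF() is the same wrap-aware distance
 * for a plain index pair, e.g. IDXDIFF(8, 1000, 1024) == 32.
 */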
440
441struct iflib_rxq {
442	if_ctx_t	ifr_ctx;
443	iflib_fl_t	ifr_fl;
444	uint64_t	ifr_rx_irq;
445#ifndef __HAIKU__
446	struct pfil_head	*pfil;
447#else
448#define PFIL_PASS 0
449#endif
450	/*
451	 * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
452	 * the command queue consumer index.  Otherwise it's unused.
453	 */
454	qidx_t		ifr_cq_cidx;
455	uint16_t	ifr_id;
456	uint8_t		ifr_nfl;
457	uint8_t		ifr_ntxqirq;
458	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
459	uint8_t		ifr_fl_offset;
460#ifndef __HAIKU__
461	struct lro_ctrl			ifr_lc;
462#endif
463	struct grouptask        ifr_task;
464	struct iflib_filter_info ifr_filter_info;
465	iflib_dma_info_t		ifr_ifdi;
466
467	/* dynamically allocate if any drivers need a value substantially larger than this */
468	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
469#ifdef IFLIB_DIAGNOSTICS
470	uint64_t ifr_cpu_exec_count[256];
471#endif
472}  __aligned(CACHE_LINE_SIZE);
473
474typedef struct if_rxsd {
475	caddr_t *ifsd_cl;
476	iflib_fl_t ifsd_fl;
477	qidx_t ifsd_cidx;
478} *if_rxsd_t;
479
480/* multiple of word size */
481#ifdef __LP64__
482#define PKT_INFO_SIZE	6
483#define RXD_INFO_SIZE	5
484#define PKT_TYPE uint64_t
485#else
486#define PKT_INFO_SIZE	11
487#define RXD_INFO_SIZE	8
488#define PKT_TYPE uint32_t
489#endif
490#define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
491#define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
492
493typedef struct if_pkt_info_pad {
494	PKT_TYPE pkt_val[PKT_INFO_SIZE];
495} *if_pkt_info_pad_t;
496typedef struct if_rxd_info_pad {
497	PKT_TYPE rxd_val[RXD_INFO_SIZE];
498} *if_rxd_info_pad_t;
499
500CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
501CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
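
/*
 * The *_pad overlays above let pkt_info_zero() and rxd_info_zero() clear
 * if_pkt_info / if_rxd_info with a few word-sized stores on the hot path
 * instead of a bzero() call.  The CTASSERTs keep the overlays in lock step
 * with the real structures: growing if_pkt_info without bumping
 * PKT_INFO_SIZE fails at compile time rather than silently leaving stale
 * fields behind.
 */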
502
503
504static inline void
505pkt_info_zero(if_pkt_info_t pi)
506{
507	if_pkt_info_pad_t pi_pad;
508
509	pi_pad = (if_pkt_info_pad_t)pi;
510	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
511	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
512#ifndef __LP64__
513	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
514	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
515#endif
516}
517
518#ifndef __HAIKU__
519static device_method_t iflib_pseudo_methods[] = {
520	DEVMETHOD(device_attach, noop_attach),
521	DEVMETHOD(device_detach, iflib_pseudo_detach),
522	DEVMETHOD_END
523};
524
525driver_t iflib_pseudodriver = {
526	"iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx),
527};
528#endif
529
530static inline void
531rxd_info_zero(if_rxd_info_t ri)
532{
533	if_rxd_info_pad_t ri_pad;
534	int i;
535
536	ri_pad = (if_rxd_info_pad_t)ri;
537	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
538		ri_pad->rxd_val[i] = 0;
539		ri_pad->rxd_val[i+1] = 0;
540		ri_pad->rxd_val[i+2] = 0;
541		ri_pad->rxd_val[i+3] = 0;
542	}
543#ifdef __LP64__
544	ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
545#endif
546}
547
/*
 * Only allow a single packet to take up at most 1/nth of the tx ring
 */
551#define MAX_SINGLE_PACKET_FRACTION 12
552#define IF_BAD_DMA (bus_addr_t)-1
553
554#define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
555
556#define CTX_LOCK_INIT(_sc)  sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
557#define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
558#define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
559#define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
560
561#define STATE_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
562#define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
563#define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
564#define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
565
566#define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
567#define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
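
/*
 * Locking sketch for the macros above: the ctx sx lock serializes slow-path
 * configuration (attach/detach, init/stop, ioctls) and may be held across
 * sleeps, while the state mutex only guards short updates to ifc_flags and
 * link state so it can be taken from callouts and taskqueues, as
 * iflib_set_detach() below does.
 */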
568
569void
570iflib_set_detach(if_ctx_t ctx)
571{
572	STATE_LOCK(ctx);
573	ctx->ifc_flags |= IFC_IN_DETACH;
574	STATE_UNLOCK(ctx);
575}
576
577/* Our boot-time initialization hook */
578static int	iflib_module_event_handler(module_t, int, void *);
579
580#ifndef __HAIKU__
581static moduledata_t iflib_moduledata = {
582	"iflib",
583	iflib_module_event_handler,
584	NULL
585};
586#endif
587
588DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
589MODULE_VERSION(iflib, 1);
590
591MODULE_DEPEND(iflib, pci, 1, 1, 1);
592MODULE_DEPEND(iflib, ether, 1, 1, 1);
593
594TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
595TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
596
597#ifndef IFLIB_DEBUG_COUNTERS
598#ifdef INVARIANTS
599#define IFLIB_DEBUG_COUNTERS 1
600#else
601#define IFLIB_DEBUG_COUNTERS 0
602#endif /* !INVARIANTS */
603#endif
604
605static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0,
606                   "iflib driver parameters");
607
608/*
609 * XXX need to ensure that this can't accidentally cause the head to be moved backwards
610 */
611static int iflib_min_tx_latency = 0;
612SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
613		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
614static int iflib_no_tx_batch = 0;
SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
		   &iflib_no_tx_batch, 0, "disable transmit batching at the possible expense of throughput");
617
618
619#if IFLIB_DEBUG_COUNTERS
620
621static int iflib_tx_seen;
622static int iflib_tx_sent;
623static int iflib_tx_encap;
624static int iflib_rx_allocs;
625static int iflib_fl_refills;
626static int iflib_fl_refills_large;
627static int iflib_tx_frees;
628
629SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
630		   &iflib_tx_seen, 0, "# TX mbufs seen");
631SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
632		   &iflib_tx_sent, 0, "# TX mbufs sent");
633SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
634		   &iflib_tx_encap, 0, "# TX mbufs encapped");
635SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
636		   &iflib_tx_frees, 0, "# TX frees");
637SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
638		   &iflib_rx_allocs, 0, "# RX allocations");
639SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
640		   &iflib_fl_refills, 0, "# refills");
641SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
642		   &iflib_fl_refills_large, 0, "# large refills");
643
644
645static int iflib_txq_drain_flushing;
646static int iflib_txq_drain_oactive;
647static int iflib_txq_drain_notready;
648
649SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
650		   &iflib_txq_drain_flushing, 0, "# drain flushes");
651SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
652		   &iflib_txq_drain_oactive, 0, "# drain oactives");
653SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
654		   &iflib_txq_drain_notready, 0, "# drain notready");
655
656
657static int iflib_encap_load_mbuf_fail;
658static int iflib_encap_pad_mbuf_fail;
659static int iflib_encap_txq_avail_fail;
660static int iflib_encap_txd_encap_fail;
661
662SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
663		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
664SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
665		   &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
666SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
667		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
668SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
669		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
670
671static int iflib_task_fn_rxs;
672static int iflib_rx_intr_enables;
673static int iflib_fast_intrs;
674static int iflib_rx_unavail;
675static int iflib_rx_ctx_inactive;
676static int iflib_rx_if_input;
677static int iflib_rxd_flush;
678
679static int iflib_verbose_debug;
680
681SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
682		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
683SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
684		   &iflib_rx_intr_enables, 0, "# RX intr enables");
685SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
686		   &iflib_fast_intrs, 0, "# fast_intr calls");
687SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
688		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
689SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
690		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
691SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
692		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
693SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
694	         &iflib_rxd_flush, 0, "# times rxd_flush called");
695SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
696		   &iflib_verbose_debug, 0, "enable verbose debugging");
697
698#define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
699static void
700iflib_debug_reset(void)
701{
702	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
703		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
704		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
705		iflib_txq_drain_notready =
706		iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
707		iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
708		iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
709		iflib_rx_unavail =
710		iflib_rx_ctx_inactive = iflib_rx_if_input =
711		iflib_rxd_flush = 0;
712}
713
714#else
715#define DBG_COUNTER_INC(name)
716static void iflib_debug_reset(void) {}
717#endif
718
719#define IFLIB_DEBUG 0
720
721static void iflib_tx_structures_free(if_ctx_t ctx);
722static void iflib_rx_structures_free(if_ctx_t ctx);
723static int iflib_queues_alloc(if_ctx_t ctx);
724static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
725static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
726static int iflib_qset_structures_setup(if_ctx_t ctx);
727static int iflib_msix_init(if_ctx_t ctx);
728static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
729static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
730static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
731#ifdef ALTQ
732static void iflib_altq_if_start(if_t ifp);
733static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
734#endif
735static int iflib_register(if_ctx_t);
736static void iflib_deregister(if_ctx_t);
737static void iflib_init_locked(if_ctx_t ctx);
738static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
739static void iflib_add_device_sysctl_post(if_ctx_t ctx);
740static void iflib_ifmp_purge(iflib_txq_t txq);
741static void _iflib_pre_assert(if_softc_ctx_t scctx);
742static void iflib_if_init_locked(if_ctx_t ctx);
743static void iflib_free_intr_mem(if_ctx_t ctx);
744#ifndef __NO_STRICT_ALIGNMENT
745static struct mbuf * iflib_fixup_rx(struct mbuf *m);
746#endif
747
748#ifndef __HAIKU__
749static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
750    SLIST_HEAD_INITIALIZER(cpu_offsets);
751struct cpu_offset {
752	SLIST_ENTRY(cpu_offset) entries;
753	cpuset_t	set;
754	unsigned int	refcount;
755	uint16_t	offset;
756};
757static struct mtx cpu_offset_mtx;
758MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
759    MTX_DEF);
760#endif
761
762NETDUMP_DEFINE(iflib);
763
764#ifdef DEV_NETMAP
765#include <sys/selinfo.h>
766#include <net/netmap.h>
767#include <dev/netmap/netmap_kern.h>
768
769MODULE_DEPEND(iflib, netmap, 1, 1, 1);
770
771static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init);
772
773/*
774 * device-specific sysctl variables:
775 *
776 * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
777 *	During regular operations the CRC is stripped, but on some
778 *	hardware reception of frames not multiple of 64 is slower,
779 *	so using crcstrip=0 helps in benchmarks.
780 *
781 * iflib_rx_miss, iflib_rx_miss_bufs:
782 *	count packets that might be missed due to lost interrupts.
783 */
784SYSCTL_DECL(_dev_netmap);
785/*
786 * The xl driver by default strips CRCs and we do not override it.
787 */
788
789int iflib_crcstrip = 1;
790SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
791    CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
792
793int iflib_rx_miss, iflib_rx_miss_bufs;
794SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
795    CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
796SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
797    CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
798
799/*
800 * Register/unregister. We are already under netmap lock.
801 * Only called on the first register or the last unregister.
802 */
803static int
804iflib_netmap_register(struct netmap_adapter *na, int onoff)
805{
806	if_t ifp = na->ifp;
807	if_ctx_t ctx = ifp->if_softc;
808	int status;
809
810	CTX_LOCK(ctx);
811	IFDI_INTR_DISABLE(ctx);
812
813	/* Tell the stack that the interface is no longer active */
814	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
815
816	if (!CTX_IS_VF(ctx))
817		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
818
819	/* enable or disable flags and callbacks in na and ifp */
820	if (onoff) {
821		nm_set_native_flags(na);
822	} else {
823		nm_clear_native_flags(na);
824	}
825	iflib_stop(ctx);
826	iflib_init_locked(ctx);
827	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
828	status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
829	if (status)
830		nm_clear_native_flags(na);
831	CTX_UNLOCK(ctx);
832	return (status);
833}
834
835static int
836netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, uint32_t nm_i, bool init)
837{
838	struct netmap_adapter *na = kring->na;
839	u_int const lim = kring->nkr_num_slots - 1;
840	u_int head = kring->rhead;
841	struct netmap_ring *ring = kring->ring;
842	bus_dmamap_t *map;
843	struct if_rxd_update iru;
844	if_ctx_t ctx = rxq->ifr_ctx;
845	iflib_fl_t fl = &rxq->ifr_fl[0];
846	uint32_t refill_pidx, nic_i;
847#if IFLIB_DEBUG_COUNTERS
848	int rf_count = 0;
849#endif
850
851	if (nm_i == head && __predict_true(!init))
852		return 0;
853	iru_init(&iru, rxq, 0 /* flid */);
854	map = fl->ifl_sds.ifsd_map;
855	refill_pidx = netmap_idx_k2n(kring, nm_i);
856	/*
857	 * IMPORTANT: we must leave one free slot in the ring,
858	 * so move head back by one unit
859	 */
860	head = nm_prev(head, lim);
861	nic_i = UINT_MAX;
862	DBG_COUNTER_INC(fl_refills);
863	while (nm_i != head) {
864#if IFLIB_DEBUG_COUNTERS
865		if (++rf_count == 9)
866			DBG_COUNTER_INC(fl_refills_large);
867#endif
868		for (int tmp_pidx = 0; tmp_pidx < IFLIB_MAX_RX_REFRESH && nm_i != head; tmp_pidx++) {
869			struct netmap_slot *slot = &ring->slot[nm_i];
870			void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[tmp_pidx]);
871			uint32_t nic_i_dma = refill_pidx;
872			nic_i = netmap_idx_k2n(kring, nm_i);
873
874			MPASS(tmp_pidx < IFLIB_MAX_RX_REFRESH);
875
876			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
877			        return netmap_ring_reinit(kring);
878
879			fl->ifl_vm_addrs[tmp_pidx] = addr;
880			if (__predict_false(init)) {
881				netmap_load_map(na, fl->ifl_buf_tag,
882				    map[nic_i], addr);
883			} else if (slot->flags & NS_BUF_CHANGED) {
884				/* buffer has changed, reload map */
885				netmap_reload_map(na, fl->ifl_buf_tag,
886				    map[nic_i], addr);
887			}
888			slot->flags &= ~NS_BUF_CHANGED;
889
890			nm_i = nm_next(nm_i, lim);
891			fl->ifl_rxd_idxs[tmp_pidx] = nic_i = nm_next(nic_i, lim);
892			if (nm_i != head && tmp_pidx < IFLIB_MAX_RX_REFRESH-1)
893				continue;
894
895			iru.iru_pidx = refill_pidx;
896			iru.iru_count = tmp_pidx+1;
897			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
898			refill_pidx = nic_i;
899			for (int n = 0; n < iru.iru_count; n++) {
900				bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i_dma],
901						BUS_DMASYNC_PREREAD);
902				/* XXX - change this to not use the netmap func*/
903				nic_i_dma = nm_next(nic_i_dma, lim);
904			}
905		}
906	}
907	kring->nr_hwcur = head;
908
909	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
910	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
911	if (__predict_true(nic_i != UINT_MAX)) {
912		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
913		DBG_COUNTER_INC(rxd_flush);
914	}
915	return (0);
916}
917
918/*
919 * Reconcile kernel and user view of the transmit ring.
920 *
921 * All information is in the kring.
922 * Userspace wants to send packets up to the one before kring->rhead,
923 * kernel knows kring->nr_hwcur is the first unsent packet.
924 *
925 * Here we push packets out (as many as possible), and possibly
926 * reclaim buffers from previously completed transmission.
927 *
928 * The caller (netmap) guarantees that there is only one instance
929 * running at any time. Any interference with other driver
930 * methods should be handled by the individual drivers.
931 */
932static int
933iflib_netmap_txsync(struct netmap_kring *kring, int flags)
934{
935	struct netmap_adapter *na = kring->na;
936	if_t ifp = na->ifp;
937	struct netmap_ring *ring = kring->ring;
938	u_int nm_i;	/* index into the netmap kring */
939	u_int nic_i;	/* index into the NIC ring */
940	u_int n;
941	u_int const lim = kring->nkr_num_slots - 1;
942	u_int const head = kring->rhead;
943	struct if_pkt_info pi;
944
945	/*
946	 * interrupts on every tx packet are expensive so request
947	 * them every half ring, or where NS_REPORT is set
948	 */
949	u_int report_frequency = kring->nkr_num_slots >> 1;
950	/* device-specific */
951	if_ctx_t ctx = ifp->if_softc;
952	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
953
954	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
955	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
956
957	/*
958	 * First part: process new packets to send.
959	 * nm_i is the current index in the netmap kring,
960	 * nic_i is the corresponding index in the NIC ring.
961	 *
962	 * If we have packets to send (nm_i != head)
963	 * iterate over the netmap ring, fetch length and update
964	 * the corresponding slot in the NIC ring. Some drivers also
965	 * need to update the buffer's physical address in the NIC slot
	 * even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
	 *
	 * The netmap_reload_map() call is especially expensive,
	 * even when (as in this case) the tag is 0, so only do it
	 * when the buffer has actually changed.
971	 *
972	 * If possible do not set the report/intr bit on all slots,
973	 * but only a few times per ring or when NS_REPORT is set.
974	 *
975	 * Finally, on 10G and faster drivers, it might be useful
976	 * to prefetch the next slot and txr entry.
977	 */
978
979	nm_i = kring->nr_hwcur;
980	if (nm_i != head) {	/* we have new packets to send */
981		pkt_info_zero(&pi);
982		pi.ipi_segs = txq->ift_segs;
983		pi.ipi_qsidx = kring->ring_id;
984		nic_i = netmap_idx_k2n(kring, nm_i);
985
986		__builtin_prefetch(&ring->slot[nm_i]);
987		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
988		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
989
990		for (n = 0; nm_i != head; n++) {
991			struct netmap_slot *slot = &ring->slot[nm_i];
992			u_int len = slot->len;
993			uint64_t paddr;
994			void *addr = PNMB(na, slot, &paddr);
995			int flags = (slot->flags & NS_REPORT ||
996				nic_i == 0 || nic_i == report_frequency) ?
997				IPI_TX_INTR : 0;
998
999			/* device-specific */
1000			pi.ipi_len = len;
1001			pi.ipi_segs[0].ds_addr = paddr;
1002			pi.ipi_segs[0].ds_len = len;
1003			pi.ipi_nsegs = 1;
1004			pi.ipi_ndescs = 0;
1005			pi.ipi_pidx = nic_i;
1006			pi.ipi_flags = flags;
1007
1008			/* Fill the slot in the NIC ring. */
1009			ctx->isc_txd_encap(ctx->ifc_softc, &pi);
1010			DBG_COUNTER_INC(tx_encap);
1011
1012			/* prefetch for next round */
1013			__builtin_prefetch(&ring->slot[nm_i + 1]);
1014			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
1015			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
1016
1017			NM_CHECK_ADDR_LEN(na, addr, len);
1018
1019			if (slot->flags & NS_BUF_CHANGED) {
1020				/* buffer has changed, reload map */
1021				netmap_reload_map(na, txq->ift_buf_tag,
1022				    txq->ift_sds.ifsd_map[nic_i], addr);
1023			}
1024			/* make sure changes to the buffer are synced */
1025			bus_dmamap_sync(txq->ift_buf_tag,
1026			    txq->ift_sds.ifsd_map[nic_i],
1027			    BUS_DMASYNC_PREWRITE);
1028
1029			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
1030			nm_i = nm_next(nm_i, lim);
1031			nic_i = nm_next(nic_i, lim);
1032		}
1033		kring->nr_hwcur = nm_i;
1034
1035		/* synchronize the NIC ring */
1036		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1037		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1038
1039		/* (re)start the tx unit up to slot nic_i (excluded) */
1040		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
1041	}
1042
1043	/*
1044	 * Second part: reclaim buffers for completed transmissions.
1045	 *
1046	 * If there are unclaimed buffers, attempt to reclaim them.
1047	 * If none are reclaimed, and TX IRQs are not in use, do an initial
1048	 * minimal delay, then trigger the tx handler which will spin in the
1049	 * group task queue.
1050	 */
1051	if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
1052		if (iflib_tx_credits_update(ctx, txq)) {
1053			/* some tx completed, increment avail */
1054			nic_i = txq->ift_cidx_processed;
1055			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
1056		}
1057	}
	if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
		if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
			callout_reset_on(&txq->ift_timer, hz < 2000 ? 1 : hz / 1000,
			    iflib_timer, txq, txq->ift_timer.c_cpu);
		}
1063	return (0);
1064}
1065
1066/*
1067 * Reconcile kernel and user view of the receive ring.
1068 * Same as for the txsync, this routine must be efficient.
 * The caller guarantees a single invocation, but races against
1070 * the rest of the driver should be handled here.
1071 *
1072 * On call, kring->rhead is the first packet that userspace wants
1073 * to keep, and kring->rcur is the wakeup point.
1074 * The kernel has previously reported packets up to kring->rtail.
1075 *
1076 * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
1077 * of whether or not we received an interrupt.
1078 */
1079static int
1080iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
1081{
1082	struct netmap_adapter *na = kring->na;
1083	struct netmap_ring *ring = kring->ring;
1084	if_t ifp = na->ifp;
1085	iflib_fl_t fl;
1086	uint32_t nm_i;	/* index into the netmap ring */
1087	uint32_t nic_i;	/* index into the NIC ring */
1088	u_int i, n;
1089	u_int const lim = kring->nkr_num_slots - 1;
1090	u_int const head = kring->rhead;
1091	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
1092	struct if_rxd_info ri;
1093
1094	if_ctx_t ctx = ifp->if_softc;
1095	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
1096	if (head > lim)
1097		return netmap_ring_reinit(kring);
1098
1099	/*
1100	 * XXX netmap_fl_refill() only ever (re)fills free list 0 so far.
1101	 */
1102
1103	for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++) {
1104		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
1105		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1106	}
1107
1108	/*
1109	 * First part: import newly received packets.
1110	 *
1111	 * nm_i is the index of the next free slot in the netmap ring,
1112	 * nic_i is the index of the next received packet in the NIC ring,
1113	 * and they may differ in case if_init() has been called while
1114	 * in netmap mode. For the receive ring we have
1115	 *
1116	 *	nic_i = rxr->next_check;
1117	 *	nm_i = kring->nr_hwtail (previous)
1118	 * and
1119	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1120	 *
1121	 * rxr->next_check is set to 0 on a ring reinit
1122	 */
1123	if (netmap_no_pendintr || force_update) {
1124		int crclen = iflib_crcstrip ? 0 : 4;
1125		int error, avail;
1126
1127		for (i = 0; i < rxq->ifr_nfl; i++) {
1128			fl = &rxq->ifr_fl[i];
1129			nic_i = fl->ifl_cidx;
1130			nm_i = netmap_idx_n2k(kring, nic_i);
1131			avail = ctx->isc_rxd_available(ctx->ifc_softc,
1132			    rxq->ifr_id, nic_i, USHRT_MAX);
1133			for (n = 0; avail > 0; n++, avail--) {
1134				rxd_info_zero(&ri);
1135				ri.iri_frags = rxq->ifr_frags;
1136				ri.iri_qsidx = kring->ring_id;
1137				ri.iri_ifp = ctx->ifc_ifp;
1138				ri.iri_cidx = nic_i;
1139
1140				error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
1141				ring->slot[nm_i].len = error ? 0 : ri.iri_len - crclen;
1142				ring->slot[nm_i].flags = 0;
1143				bus_dmamap_sync(fl->ifl_buf_tag,
1144				    fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
1145				nm_i = nm_next(nm_i, lim);
1146				nic_i = nm_next(nic_i, lim);
1147			}
1148			if (n) { /* update the state variables */
1149				if (netmap_no_pendintr && !force_update) {
1150					/* diagnostics */
1151					iflib_rx_miss ++;
1152					iflib_rx_miss_bufs += n;
1153				}
1154				fl->ifl_cidx = nic_i;
1155				kring->nr_hwtail = nm_i;
1156			}
1157			kring->nr_kflags &= ~NKR_PENDINTR;
1158		}
1159	}
1160	/*
1161	 * Second part: skip past packets that userspace has released.
1162	 * (kring->nr_hwcur to head excluded),
1163	 * and make the buffers available for reception.
1164	 * As usual nm_i is the index in the netmap ring,
1165	 * nic_i is the index in the NIC ring, and
1166	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1167	 */
1168	/* XXX not sure how this will work with multiple free lists */
1169	nm_i = kring->nr_hwcur;
1170
1171	return (netmap_fl_refill(rxq, kring, nm_i, false));
1172}
1173
1174static void
1175iflib_netmap_intr(struct netmap_adapter *na, int onoff)
1176{
1177	if_ctx_t ctx = na->ifp->if_softc;
1178
1179	CTX_LOCK(ctx);
1180	if (onoff) {
1181		IFDI_INTR_ENABLE(ctx);
1182	} else {
1183		IFDI_INTR_DISABLE(ctx);
1184	}
1185	CTX_UNLOCK(ctx);
1186}
1187
1188
1189static int
1190iflib_netmap_attach(if_ctx_t ctx)
1191{
1192	struct netmap_adapter na;
1193	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1194
1195	bzero(&na, sizeof(na));
1196
1197	na.ifp = ctx->ifc_ifp;
1198	na.na_flags = NAF_BDG_MAYSLEEP;
1199	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
1200	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
1201
1202	na.num_tx_desc = scctx->isc_ntxd[0];
1203	na.num_rx_desc = scctx->isc_nrxd[0];
1204	na.nm_txsync = iflib_netmap_txsync;
1205	na.nm_rxsync = iflib_netmap_rxsync;
1206	na.nm_register = iflib_netmap_register;
1207	na.nm_intr = iflib_netmap_intr;
1208	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
1209	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
1210	return (netmap_attach(&na));
1211}
1212
1213static void
1214iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
1215{
1216	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1217	struct netmap_slot *slot;
1218
1219	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
1220	if (slot == NULL)
1221		return;
1222	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
1223
1224		/*
1225		 * In netmap mode, set the map for the packet buffer.
1226		 * NOTE: Some drivers (not this one) also need to set
1227		 * the physical buffer address in the NIC ring.
1228		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
1229		 * netmap slot index, si
1230		 */
1231		int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i);
1232		netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i],
1233		    NMB(na, slot + si));
1234	}
1235}
1236
1237static void
1238iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
1239{
1240	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1241	struct netmap_kring *kring = na->rx_rings[rxq->ifr_id];
1242	struct netmap_slot *slot;
1243	uint32_t nm_i;
1244
1245	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
1246	if (slot == NULL)
1247		return;
1248	nm_i = netmap_idx_n2k(kring, 0);
1249	netmap_fl_refill(rxq, kring, nm_i, true);
1250}
1251
1252static void
1253iflib_netmap_timer_adjust(if_ctx_t ctx, iflib_txq_t txq, uint32_t *reset_on)
1254{
1255	struct netmap_kring *kring;
1256	uint16_t txqid;
1257
1258	txqid = txq->ift_id;
1259	kring = NA(ctx->ifc_ifp)->tx_rings[txqid];
1260
1261	if (kring->nr_hwcur != nm_next(kring->nr_hwtail, kring->nkr_num_slots - 1)) {
1262		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1263		    BUS_DMASYNC_POSTREAD);
1264		if (ctx->isc_txd_credits_update(ctx->ifc_softc, txqid, false))
1265			netmap_tx_irq(ctx->ifc_ifp, txqid);
1266		if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ)) {
1267			if (hz < 2000)
1268				*reset_on = 1;
1269			else
1270				*reset_on = hz / 1000;
1271		}
1272	}
1273}
1274
1275#define iflib_netmap_detach(ifp) netmap_detach(ifp)
1276
1277#else
1278#define iflib_netmap_txq_init(ctx, txq)
1279#define iflib_netmap_rxq_init(ctx, rxq)
1280#define iflib_netmap_detach(ifp)
1281
1282#define iflib_netmap_attach(ctx) (0)
1283#define netmap_rx_irq(ifp, qid, budget) (0)
1284#define netmap_tx_irq(ifp, qid) do {} while (0)
1285#define iflib_netmap_timer_adjust(ctx, txq, reset_on)
1286#endif
1287
1288#if defined(__i386__) || defined(__amd64__)
1289static __inline void
1290prefetch(void *x)
1291{
1292	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1293}
1294static __inline void
1295prefetch2cachelines(void *x)
1296{
1297	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1298#if (CACHE_LINE_SIZE < 128)
1299	__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
1300#endif
1301}
1302#else
1303#define prefetch(x)
1304#define prefetch2cachelines(x)
1305#endif
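
/*
 * prefetch() touches a single cache line; prefetch2cachelines() also pulls
 * in the following line when CACHE_LINE_SIZE is below 128 bytes, covering
 * small structures that may straddle a line boundary.  On other
 * architectures both compile away to nothing.
 */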
1306
1307static void
1308iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
1309{
1310	iflib_fl_t fl;
1311
1312	fl = &rxq->ifr_fl[flid];
1313	iru->iru_paddrs = fl->ifl_bus_addrs;
1314	iru->iru_vaddrs = &fl->ifl_vm_addrs[0];
1315	iru->iru_idxs = fl->ifl_rxd_idxs;
1316	iru->iru_qsidx = rxq->ifr_id;
1317	iru->iru_buf_size = fl->ifl_buf_size;
1318	iru->iru_flidx = fl->ifl_id;
1319}
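
/*
 * iru_init() simply points the update descriptor at the free list's staging
 * arrays filled in by the refill path.  A minimal sketch of how a driver's
 * isc_rxd_refill callback might consume it (the ring and field names in the
 * body are illustrative only, not taken from any particular driver):
 *
 *	for (i = 0; i < iru->iru_count; i++) {
 *		idx = iru->iru_idxs[i];
 *		rxring[idx].addr = htole64(iru->iru_paddrs[i]);
 *	}
 */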
1320
1321static void
1322_iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1323{
1324	if (err)
1325		return;
1326	*(bus_addr_t *) arg = segs[0].ds_addr;
1327}
1328
1329int
1330iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
1331{
1332	int err;
1333	device_t dev = ctx->ifc_dev;
1334
1335	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
1336				align, 0,		/* alignment, bounds */
1337				BUS_SPACE_MAXADDR,	/* lowaddr */
1338				BUS_SPACE_MAXADDR,	/* highaddr */
1339				NULL, NULL,		/* filter, filterarg */
1340				size,			/* maxsize */
1341				1,			/* nsegments */
1342				size,			/* maxsegsize */
1343				BUS_DMA_ALLOCNOW,	/* flags */
1344				NULL,			/* lockfunc */
1345				NULL,			/* lockarg */
1346				&dma->idi_tag);
1347	if (err) {
1348		device_printf(dev,
1349		    "%s: bus_dma_tag_create failed: %d\n",
1350		    __func__, err);
1351		goto fail_0;
1352	}
1353
1354	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
1355	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
1356	if (err) {
1357		device_printf(dev,
1358		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
1359		    __func__, (uintmax_t)size, err);
1360		goto fail_1;
1361	}
1362
1363	dma->idi_paddr = IF_BAD_DMA;
1364	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
1365	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
1366	if (err || dma->idi_paddr == IF_BAD_DMA) {
1367		device_printf(dev,
1368		    "%s: bus_dmamap_load failed: %d\n",
1369		    __func__, err);
1370		goto fail_2;
1371	}
1372
1373	dma->idi_size = size;
1374	return (0);
1375
1376fail_2:
1377	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1378fail_1:
1379	bus_dma_tag_destroy(dma->idi_tag);
1380fail_0:
1381	dma->idi_tag = NULL;
1382
1383	return (err);
1384}
1385
1386int
1387iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
1388{
1389	if_shared_ctx_t sctx = ctx->ifc_sctx;
1390
1391	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
1392
1393	return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags));
1394}
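
/*
 * Typical usage of the allocator above (a sketch; the 4096-byte size and
 * the error handling are illustrative, not taken from a real consumer):
 *
 *	struct iflib_dma_info di;
 *
 *	if (iflib_dma_alloc(ctx, 4096, &di, BUS_DMA_NOWAIT) != 0)
 *		return (ENOMEM);
 *	... hand di.idi_paddr to the hardware, use di.idi_vaddr in software ...
 *	iflib_dma_free(&di);
 */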
1395
1396int
1397iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
1398{
	int i, err = 0;
1400	iflib_dma_info_t *dmaiter;
1401
1402	dmaiter = dmalist;
1403	for (i = 0; i < count; i++, dmaiter++) {
1404		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
1405			break;
1406	}
1407	if (err)
1408		iflib_dma_free_multi(dmalist, i);
1409	return (err);
1410}
1411
1412void
1413iflib_dma_free(iflib_dma_info_t dma)
1414{
1415	if (dma->idi_tag == NULL)
1416		return;
1417	if (dma->idi_paddr != IF_BAD_DMA) {
1418		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
1419		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1420		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
1421		dma->idi_paddr = IF_BAD_DMA;
1422	}
1423	if (dma->idi_vaddr != NULL) {
1424		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1425		dma->idi_vaddr = NULL;
1426	}
1427	bus_dma_tag_destroy(dma->idi_tag);
1428	dma->idi_tag = NULL;
1429}
1430
1431void
1432iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
1433{
1434	int i;
1435	iflib_dma_info_t *dmaiter = dmalist;
1436
1437	for (i = 0; i < count; i++, dmaiter++)
1438		iflib_dma_free(*dmaiter);
1439}
1440
1441#ifdef EARLY_AP_STARTUP
1442static const int iflib_started = 1;
1443#else
1444/*
1445 * We used to abuse the smp_started flag to decide if the queues have been
1446 * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()).
1447 * That gave bad races, since the SYSINIT() runs strictly after smp_started
1448 * is set.  Run a SYSINIT() strictly after that to just set a usable
1449 * completion flag.
1450 */
1451
1452static int iflib_started;
1453
1454static void
1455iflib_record_started(void *arg)
1456{
1457	iflib_started = 1;
1458}
1459
1460SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST,
1461	iflib_record_started, NULL);
1462#endif
1463
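/*
 * Interrupt filter used for "fast" per-queue vectors: it runs in interrupt
 * filter context, lets the driver's own filter (if any) claim or reject the
 * interrupt first, and otherwise just enqueues the queue's group task,
 * deferring the real work to task context.
 */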
1464static int
1465iflib_fast_intr(void *arg)
1466{
1467	iflib_filter_info_t info = arg;
1468	struct grouptask *gtask = info->ifi_task;
1469	int result;
1470
1471	if (!iflib_started)
1472		return (FILTER_STRAY);
1473
1474	DBG_COUNTER_INC(fast_intrs);
1475	if (info->ifi_filter != NULL) {
1476		result = info->ifi_filter(info->ifi_filter_arg);
1477		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1478			return (result);
1479	}
1480
1481	GROUPTASK_ENQUEUE(gtask);
1482	return (FILTER_SCHEDULE_THREAD);
1483}
1484
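/*
 * Filter for RX vectors that also service one or more TX queues.  For each
 * associated txq it either schedules the TX task (when completed credits
 * are pending) or re-enables the TX interrupt; it then does the same for
 * RX, falling back to a single IFDI_INTR_ENABLE() when the device only has
 * a legacy interrupt.
 */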
1485static int
1486iflib_fast_intr_rxtx(void *arg)
1487{
1488	iflib_filter_info_t info = arg;
1489	struct grouptask *gtask = info->ifi_task;
1490	if_ctx_t ctx;
1491	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
1492	iflib_txq_t txq;
1493	void *sc;
1494	int i, cidx, result;
1495	qidx_t txqid;
1496	bool intr_enable, intr_legacy;
1497
1498	if (!iflib_started)
1499		return (FILTER_STRAY);
1500
1501	DBG_COUNTER_INC(fast_intrs);
1502	if (info->ifi_filter != NULL) {
1503		result = info->ifi_filter(info->ifi_filter_arg);
1504		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1505			return (result);
1506	}
1507
1508	ctx = rxq->ifr_ctx;
1509	sc = ctx->ifc_softc;
1510	intr_enable = false;
1511	intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
1512	MPASS(rxq->ifr_ntxqirq);
1513	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
1514		txqid = rxq->ifr_txqid[i];
1515		txq = &ctx->ifc_txqs[txqid];
1516		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1517		    BUS_DMASYNC_POSTREAD);
1518		if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
1519			if (intr_legacy)
1520				intr_enable = true;
1521			else
1522				IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
1523			continue;
1524		}
1525		GROUPTASK_ENQUEUE(&txq->ift_task);
1526	}
1527	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
1528		cidx = rxq->ifr_cq_cidx;
1529	else
1530		cidx = rxq->ifr_fl[0].ifl_cidx;
1531	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
1532		GROUPTASK_ENQUEUE(gtask);
1533	else {
1534		if (intr_legacy)
1535			intr_enable = true;
1536		else
1537			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
1538		DBG_COUNTER_INC(rx_intr_enables);
1539	}
1540	if (intr_enable)
1541		IFDI_INTR_ENABLE(ctx);
1542	return (FILTER_SCHEDULE_THREAD);
1543}
1544
1545
1546static int
1547iflib_fast_intr_ctx(void *arg)
1548{
1549	iflib_filter_info_t info = arg;
1550	struct grouptask *gtask = info->ifi_task;
1551	int result;
1552
1553	if (!iflib_started)
1554		return (FILTER_STRAY);
1555
1556	DBG_COUNTER_INC(fast_intrs);
1557	if (info->ifi_filter != NULL) {
1558		result = info->ifi_filter(info->ifi_filter_arg);
1559		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1560			return (result);
1561	}
1562
1563	GROUPTASK_ENQUEUE(gtask);
1564	return (FILTER_SCHEDULE_THREAD);
1565}
1566
1567static int
1568_iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
1569		 driver_filter_t filter, driver_intr_t handler, void *arg,
1570		 const char *name)
1571{
1572	struct resource *res;
1573	void *tag = NULL;
1574	device_t dev = ctx->ifc_dev;
1575	int flags, i, rc;
1576
1577	flags = RF_ACTIVE;
1578	if (ctx->ifc_flags & IFC_LEGACY)
1579		flags |= RF_SHAREABLE;
1580	MPASS(rid < 512);
1581	i = rid;
1582	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags);
1583	if (res == NULL) {
1584		device_printf(dev,
1585		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
1586		return (ENOMEM);
1587	}
1588	irq->ii_res = res;
1589	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
1590	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
1591						filter, handler, arg, &tag);
1592	if (rc != 0) {
1593		device_printf(dev,
1594		    "failed to setup interrupt for rid %d, name %s: %d\n",
1595					  rid, name ? name : "unknown", rc);
1596		return (rc);
1597	} else if (name)
1598		bus_describe_intr(dev, res, tag, "%s", name);
1599
1600	irq->ii_tag = tag;
1601	return (0);
1602}
1603
1604
1605/*********************************************************************
1606 *
1607 *  Allocate DMA resources for TX buffers as well as memory for the TX
 *  mbuf map.  TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in an
1609 *  iflib_sw_tx_desc_array structure, storing all the information that
1610 *  is needed to transmit a packet on the wire.  This is called only
1611 *  once at attach, setup is done every reset.
1612 *
1613 **********************************************************************/
1614static int
1615iflib_txsd_alloc(iflib_txq_t txq)
1616{
1617	if_ctx_t ctx = txq->ift_ctx;
1618	if_shared_ctx_t sctx = ctx->ifc_sctx;
1619	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1620	device_t dev = ctx->ifc_dev;
1621	bus_size_t tsomaxsize;
1622	int err, nsegments, ntsosegments;
1623	bool tso;
1624	int i;
1625
1626	nsegments = scctx->isc_tx_nsegments;
1627	ntsosegments = scctx->isc_tx_tso_segments_max;
1628	tsomaxsize = scctx->isc_tx_tso_size_max;
1629	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
1630		tsomaxsize += sizeof(struct ether_vlan_header);
1631	MPASS(scctx->isc_ntxd[0] > 0);
1632	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
1633	MPASS(nsegments > 0);
1634	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
1635		MPASS(ntsosegments > 0);
1636		MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
1637	}
1638
1639	/*
1640	 * Set up DMA tags for TX buffers.
1641	 */
1642	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1643			       1, 0,			/* alignment, bounds */
1644			       BUS_SPACE_MAXADDR,	/* lowaddr */
1645			       BUS_SPACE_MAXADDR,	/* highaddr */
1646			       NULL, NULL,		/* filter, filterarg */
1647			       sctx->isc_tx_maxsize,		/* maxsize */
1648			       nsegments,	/* nsegments */
1649			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
1650			       0,			/* flags */
1651			       NULL,			/* lockfunc */
1652			       NULL,			/* lockfuncarg */
1653			       &txq->ift_buf_tag))) {
1654		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
1655		device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
1656		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
1657		goto fail;
1658	}
1659	tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
1660	if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
1661			       1, 0,			/* alignment, bounds */
1662			       BUS_SPACE_MAXADDR,	/* lowaddr */
1663			       BUS_SPACE_MAXADDR,	/* highaddr */
1664			       NULL, NULL,		/* filter, filterarg */
1665			       tsomaxsize,		/* maxsize */
1666			       ntsosegments,	/* nsegments */
1667			       sctx->isc_tso_maxsegsize,/* maxsegsize */
1668			       0,			/* flags */
1669			       NULL,			/* lockfunc */
1670			       NULL,			/* lockfuncarg */
1671			       &txq->ift_tso_buf_tag))) {
1672		device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
1673		    err);
1674		goto fail;
1675	}
1676
1677	/* Allocate memory for the TX mbuf map. */
1678	if (!(txq->ift_sds.ifsd_m =
1679	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
1680	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1681		device_printf(dev, "Unable to allocate TX mbuf map memory\n");
1682		err = ENOMEM;
1683		goto fail;
1684	}
1685
1686	/*
1687	 * Create the DMA maps for TX buffers.
1688	 */
1689	if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
1690	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1691	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1692		device_printf(dev,
1693		    "Unable to allocate TX buffer DMA map memory\n");
1694		err = ENOMEM;
1695		goto fail;
1696	}
1697	if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
1698	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1699	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1700		device_printf(dev,
1701		    "Unable to allocate TSO TX buffer map memory\n");
1702		err = ENOMEM;
1703		goto fail;
1704	}
1705	for (i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
1706		err = bus_dmamap_create(txq->ift_buf_tag, 0,
1707		    &txq->ift_sds.ifsd_map[i]);
1708		if (err != 0) {
1709			device_printf(dev, "Unable to create TX DMA map\n");
1710			goto fail;
1711		}
1712		if (!tso)
1713			continue;
1714		err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
1715		    &txq->ift_sds.ifsd_tso_map[i]);
1716		if (err != 0) {
1717			device_printf(dev, "Unable to create TSO TX DMA map\n");
1718			goto fail;
1719		}
1720	}
1721	return (0);
1722fail:
	/* Free everything; this handles the case where we failed partway through. */
1724	iflib_tx_structures_free(ctx);
1725	return (err);
1726}
1727
1728static void
1729iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
1730{
1731	bus_dmamap_t map;
1732
1733	map = NULL;
1734	if (txq->ift_sds.ifsd_map != NULL)
1735		map = txq->ift_sds.ifsd_map[i];
1736	if (map != NULL) {
1737		bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE);
1738		bus_dmamap_unload(txq->ift_buf_tag, map);
1739		bus_dmamap_destroy(txq->ift_buf_tag, map);
1740		txq->ift_sds.ifsd_map[i] = NULL;
1741	}
1742
1743	map = NULL;
1744	if (txq->ift_sds.ifsd_tso_map != NULL)
1745		map = txq->ift_sds.ifsd_tso_map[i];
1746	if (map != NULL) {
1747		bus_dmamap_sync(txq->ift_tso_buf_tag, map,
1748		    BUS_DMASYNC_POSTWRITE);
1749		bus_dmamap_unload(txq->ift_tso_buf_tag, map);
1750		bus_dmamap_destroy(txq->ift_tso_buf_tag, map);
1751		txq->ift_sds.ifsd_tso_map[i] = NULL;
1752	}
1753}
1754
1755static void
1756iflib_txq_destroy(iflib_txq_t txq)
1757{
1758	if_ctx_t ctx = txq->ift_ctx;
1759	int i;
1760
1761	for (i = 0; i < txq->ift_size; i++)
1762		iflib_txsd_destroy(ctx, txq, i);
1763	if (txq->ift_sds.ifsd_map != NULL) {
1764		free(txq->ift_sds.ifsd_map, M_IFLIB);
1765		txq->ift_sds.ifsd_map = NULL;
1766	}
1767	if (txq->ift_sds.ifsd_tso_map != NULL) {
1768		free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
1769		txq->ift_sds.ifsd_tso_map = NULL;
1770	}
1771	if (txq->ift_sds.ifsd_m != NULL) {
1772		free(txq->ift_sds.ifsd_m, M_IFLIB);
1773		txq->ift_sds.ifsd_m = NULL;
1774	}
1775	if (txq->ift_buf_tag != NULL) {
1776		bus_dma_tag_destroy(txq->ift_buf_tag);
1777		txq->ift_buf_tag = NULL;
1778	}
1779	if (txq->ift_tso_buf_tag != NULL) {
1780		bus_dma_tag_destroy(txq->ift_tso_buf_tag);
1781		txq->ift_tso_buf_tag = NULL;
1782	}
1783}
1784
1785static void
1786iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
1787{
1788	struct mbuf **mp;
1789
1790	mp = &txq->ift_sds.ifsd_m[i];
1791	if (*mp == NULL)
1792		return;
1793
1794	if (txq->ift_sds.ifsd_map != NULL) {
1795		bus_dmamap_sync(txq->ift_buf_tag,
1796		    txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
1797		bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
1798	}
1799	if (txq->ift_sds.ifsd_tso_map != NULL) {
1800		bus_dmamap_sync(txq->ift_tso_buf_tag,
1801		    txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
1802		bus_dmamap_unload(txq->ift_tso_buf_tag,
1803		    txq->ift_sds.ifsd_tso_map[i]);
1804	}
1805	m_free(*mp);
1806	DBG_COUNTER_INC(tx_frees);
1807	*mp = NULL;
1808}
1809
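/*
 * Reset a TX queue's software state, zero its descriptor memory, and let
 * the driver program the hardware ring via IFDI_TXQ_SETUP().
 */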
1810static int
1811iflib_txq_setup(iflib_txq_t txq)
1812{
1813	if_ctx_t ctx = txq->ift_ctx;
1814	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1815	if_shared_ctx_t sctx = ctx->ifc_sctx;
1816	iflib_dma_info_t di;
1817	int i;
1818
1819	/* Set number of descriptors available */
1820	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
1821	/* XXX make configurable */
1822	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
1823
1824	/* Reset indices */
1825	txq->ift_cidx_processed = 0;
1826	txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
1827	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
1828
1829	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1830		bzero((void *)di->idi_vaddr, di->idi_size);
1831
1832	IFDI_TXQ_SETUP(ctx, txq->ift_id);
1833	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1834		bus_dmamap_sync(di->idi_tag, di->idi_map,
1835		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1836	return (0);
1837}
1838
1839/*********************************************************************
1840 *
1841 *  Allocate DMA resources for RX buffers as well as memory for the RX
1842 *  mbuf map, direct RX cluster pointer map and RX cluster bus address
1843 *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
1844 *  RX cluster map are kept in a iflib_sw_rx_desc_array structure.
 *  Since we use one entry in iflib_sw_rx_desc_array per received
1846 *  packet, the maximum number of entries we'll need is equal to the
1847 *  number of hardware receive descriptors that we've allocated.
1848 *
1849 **********************************************************************/
1850static int
1851iflib_rxsd_alloc(iflib_rxq_t rxq)
1852{
1853	if_ctx_t ctx = rxq->ifr_ctx;
1854	if_shared_ctx_t sctx = ctx->ifc_sctx;
1855	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1856	device_t dev = ctx->ifc_dev;
1857	iflib_fl_t fl;
1858	int			err;
1859	int i;
1860
1861	MPASS(scctx->isc_nrxd[0] > 0);
1862	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
1863
1864	fl = rxq->ifr_fl;
1865	for (i = 0; i <  rxq->ifr_nfl; i++, fl++) {
1866		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
1867		/* Set up DMA tag for RX buffers. */
1868		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1869					 1, 0,			/* alignment, bounds */
1870					 BUS_SPACE_MAXADDR,	/* lowaddr */
1871					 BUS_SPACE_MAXADDR,	/* highaddr */
1872					 NULL, NULL,		/* filter, filterarg */
1873					 sctx->isc_rx_maxsize,	/* maxsize */
1874					 sctx->isc_rx_nsegments,	/* nsegments */
1875					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
1876					 0,			/* flags */
1877					 NULL,			/* lockfunc */
1878					 NULL,			/* lockarg */
1879					 &fl->ifl_buf_tag);
1880		if (err) {
1881			device_printf(dev,
1882			    "Unable to allocate RX DMA tag: %d\n", err);
1883			goto fail;
1884		}
1885
1886		/* Allocate memory for the RX mbuf map. */
1887		if (!(fl->ifl_sds.ifsd_m =
1888		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
1889					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1890			device_printf(dev,
1891			    "Unable to allocate RX mbuf map memory\n");
1892			err = ENOMEM;
1893			goto fail;
1894		}
1895
1896		/* Allocate memory for the direct RX cluster pointer map. */
1897		if (!(fl->ifl_sds.ifsd_cl =
1898		      (caddr_t *) malloc(sizeof(caddr_t) *
1899					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1900			device_printf(dev,
1901			    "Unable to allocate RX cluster map memory\n");
1902			err = ENOMEM;
1903			goto fail;
1904		}
1905
1906		/* Allocate memory for the RX cluster bus address map. */
1907		if (!(fl->ifl_sds.ifsd_ba =
1908		      (bus_addr_t *) malloc(sizeof(bus_addr_t) *
1909					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1910			device_printf(dev,
1911			    "Unable to allocate RX bus address map memory\n");
1912			err = ENOMEM;
1913			goto fail;
1914		}
1915
1916		/*
1917		 * Create the DMA maps for RX buffers.
1918		 */
1919		if (!(fl->ifl_sds.ifsd_map =
1920		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1921			device_printf(dev,
1922			    "Unable to allocate RX buffer DMA map memory\n");
1923			err = ENOMEM;
1924			goto fail;
1925		}
		{
			int j;

			for (j = 0; j < scctx->isc_nrxd[rxq->ifr_fl_offset]; j++) {
				err = bus_dmamap_create(fl->ifl_buf_tag, 0,
				    &fl->ifl_sds.ifsd_map[j]);
				if (err != 0) {
					device_printf(dev,
					    "Unable to create RX buffer DMA map\n");
					goto fail;
				}
			}
		}
1937	}
1938	return (0);
1939
1940fail:
1941	iflib_rx_structures_free(ctx);
1942	return (err);
1943}
1944
1945
1946/*
1947 * Internal service routines
1948 */
1949
1950struct rxq_refill_cb_arg {
1951	int               error;
1952	bus_dma_segment_t seg;
1953	int               nseg;
1954};
1955
1956static void
1957_rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
1958{
1959	struct rxq_refill_cb_arg *cb_arg = arg;
1960
1961	cb_arg->error = error;
1962	cb_arg->seg = segs[0];
1963	cb_arg->nseg = nseg;
1964}
1965
1966/**
1967 * _iflib_fl_refill - refill an rxq free-buffer list
1968 * @ctx: the iflib context
1969 * @fl: the free list to refill
1970 * @count: the number of new buffers to allocate
1971 *
1972 * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
1973 * The caller must assure that @count does not exceed the queue's capacity.
1974 */
1975static void
1976_iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
1977{
1978	struct if_rxd_update iru;
1979	struct rxq_refill_cb_arg cb_arg;
1980	struct mbuf *m;
1981	caddr_t cl, *sd_cl;
1982	struct mbuf **sd_m;
1983	bus_dmamap_t *sd_map;
1984	bus_addr_t bus_addr, *sd_ba;
1985	int err, frag_idx, i, idx, n, pidx;
1986	qidx_t credits;
1987
1988	sd_m = fl->ifl_sds.ifsd_m;
1989	sd_map = fl->ifl_sds.ifsd_map;
1990	sd_cl = fl->ifl_sds.ifsd_cl;
1991	sd_ba = fl->ifl_sds.ifsd_ba;
1992	pidx = fl->ifl_pidx;
1993	idx = pidx;
1994	frag_idx = fl->ifl_fragidx;
1995	credits = fl->ifl_credits;
1996
1997	i = 0;
1998	n = count;
1999	MPASS(n > 0);
2000	MPASS(credits + n <= fl->ifl_size);
2001
2002	if (pidx < fl->ifl_cidx)
2003		MPASS(pidx + n <= fl->ifl_cidx);
2004	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
2005		MPASS(fl->ifl_gen == 0);
2006	if (pidx > fl->ifl_cidx)
2007		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
2008
2009	DBG_COUNTER_INC(fl_refills);
2010	if (n > 8)
2011		DBG_COUNTER_INC(fl_refills_large);
2012	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
2013	while (n--) {
2014		/*
		 * We allocate an uninitialized mbuf + cluster; the mbuf is
		 * initialized after rx.
		 *
		 * If the cluster is still set, a minimum-sized packet was
		 * received and the existing cluster can be reused.
2019		 */
2020		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
2021		    &frag_idx);
2022		if (frag_idx < 0)
2023			bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
2024		MPASS(frag_idx >= 0);
2025		if ((cl = sd_cl[frag_idx]) == NULL) {
2026			if ((cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
2027				break;
2028
2029			cb_arg.error = 0;
2030			MPASS(sd_map != NULL);
2031			err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
2032			    cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
2033			    BUS_DMA_NOWAIT);
2034			if (err != 0 || cb_arg.error) {
2035				/*
2036				 * !zone_pack ?
2037				 */
2038#ifndef __HAIKU__
2039				if (fl->ifl_zone == zone_pack)
2040					uma_zfree(fl->ifl_zone, cl);
2041#endif
2042				break;
2043			}
2044
2045			sd_ba[frag_idx] =  bus_addr = cb_arg.seg.ds_addr;
2046			sd_cl[frag_idx] = cl;
2047#if MEMORY_LOGGING
2048			fl->ifl_cl_enqueued++;
2049#endif
2050		} else {
2051			bus_addr = sd_ba[frag_idx];
2052		}
2053		bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
2054		    BUS_DMASYNC_PREREAD);
2055
2056		if (sd_m[frag_idx] == NULL) {
2057			if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
2058				break;
2059			}
2060			sd_m[frag_idx] = m;
2061		}
2062		bit_set(fl->ifl_rx_bitmap, frag_idx);
2063#if MEMORY_LOGGING
2064		fl->ifl_m_enqueued++;
2065#endif
2066
2067		DBG_COUNTER_INC(rx_allocs);
2068		fl->ifl_rxd_idxs[i] = frag_idx;
2069		fl->ifl_bus_addrs[i] = bus_addr;
2070		fl->ifl_vm_addrs[i] = cl;
2071		credits++;
2072		i++;
2073		MPASS(credits <= fl->ifl_size);
2074		if (++idx == fl->ifl_size) {
2075			fl->ifl_gen = 1;
2076			idx = 0;
2077		}
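		/*
		 * Hand the accumulated descriptors to the driver whenever the
		 * batch reaches IFLIB_MAX_RX_REFRESH entries or the requested
		 * count has been exhausted, then start a new batch.
		 */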
2078		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
2079			iru.iru_pidx = pidx;
2080			iru.iru_count = i;
2081			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2082			i = 0;
2083			pidx = idx;
2084			fl->ifl_pidx = idx;
2085			fl->ifl_credits = credits;
2086		}
2087	}
2088
2089	if (i) {
2090		iru.iru_pidx = pidx;
2091		iru.iru_count = i;
2092		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2093		fl->ifl_pidx = idx;
2094		fl->ifl_credits = credits;
2095	}
2096	DBG_COUNTER_INC(rxd_flush);
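	/*
	 * isc_rxd_flush() takes the index of the last descriptor made
	 * available, i.e. one slot behind the producer index, wrapping at
	 * the start of the ring.
	 */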
2097	if (fl->ifl_pidx == 0)
2098		pidx = fl->ifl_size - 1;
2099	else
2100		pidx = fl->ifl_pidx - 1;
2101
2102	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2103	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2104	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx);
2105	fl->ifl_fragidx = frag_idx;
2106}
2107
2108static __inline void
2109__iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max)
2110{
2111	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
2112	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
2113#ifdef INVARIANTS
2114	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
2115#endif
2116
2117	MPASS(fl->ifl_credits <= fl->ifl_size);
2118	MPASS(reclaimable == delta);
2119
2120	if (reclaimable > 0)
2121		_iflib_fl_refill(ctx, fl, min(max, reclaimable));
2122}
2123
2124uint8_t
2125iflib_in_detach(if_ctx_t ctx)
2126{
2127	bool in_detach;
2128	STATE_LOCK(ctx);
2129	in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH);
2130	STATE_UNLOCK(ctx);
2131	return (in_detach);
2132}
2133
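/*
 * Return every cluster and mbuf currently held by a free list to the
 * allocator and reset the list's indices and descriptor memory.
 */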
2134static void
2135iflib_fl_bufs_free(iflib_fl_t fl)
2136{
2137	iflib_dma_info_t idi = fl->ifl_ifdi;
2138	bus_dmamap_t sd_map;
2139	uint32_t i;
2140
2141	for (i = 0; i < fl->ifl_size; i++) {
2142		struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
2143		caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
2144
2145		if (*sd_cl != NULL) {
2146			sd_map = fl->ifl_sds.ifsd_map[i];
2147			bus_dmamap_sync(fl->ifl_buf_tag, sd_map,
2148			    BUS_DMASYNC_POSTREAD);
2149			bus_dmamap_unload(fl->ifl_buf_tag, sd_map);
2150			if (*sd_cl != NULL) {
2151#ifndef __HAIKU__
2152				uma_zfree(fl->ifl_zone, *sd_cl);
2153#else
2154				struct mbuf* mb = m_get(0, MT_DATA);
2155				m_cljset(mb, *sd_cl, fl->ifl_cltype);
2156				m_free(mb);
2157#endif
2158			}
			/* XXX: Should this get moved out? */
2160			if (iflib_in_detach(fl->ifl_rxq->ifr_ctx))
2161				bus_dmamap_destroy(fl->ifl_buf_tag, sd_map);
2162			if (*sd_m != NULL) {
2163				m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
2164#ifndef __HAIKU__
2165				uma_zfree(zone_mbuf, *sd_m);
2166#else
2167				m_free(*sd_m);
2168#endif
2169			}
2170		} else {
2171			MPASS(*sd_cl == NULL);
2172			MPASS(*sd_m == NULL);
2173		}
2174#if MEMORY_LOGGING
2175		fl->ifl_m_dequeued++;
2176		fl->ifl_cl_dequeued++;
2177#endif
2178		*sd_cl = NULL;
2179		*sd_m = NULL;
2180	}
2181#ifdef INVARIANTS
2182	for (i = 0; i < fl->ifl_size; i++) {
2183		MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
2184		MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
2185	}
2186#endif
2187	/*
2188	 * Reset free list values
2189	 */
2190	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
2191	bzero(idi->idi_vaddr, idi->idi_size);
2192}
2193
2194/*********************************************************************
2195 *
2196 *  Initialize a receive ring and its buffers.
2197 *
2198 **********************************************************************/
2199static int
2200iflib_fl_setup(iflib_fl_t fl)
2201{
2202	iflib_rxq_t rxq = fl->ifl_rxq;
2203	if_ctx_t ctx = rxq->ifr_ctx;
2204
2205	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
2206	/*
2207	** Free current RX buffer structs and their mbufs
2208	*/
2209	iflib_fl_bufs_free(fl);
2210	/* Now replenish the mbufs */
2211	MPASS(fl->ifl_credits == 0);
2212	fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
2213	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
2214		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
2215	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
2216#ifndef __HAIKU__
2217	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
2218#endif

	/*
	 * Avoid pre-allocating zillions of clusters to an idle card,
	 * potentially speeding up attach.
	 */
2224	_iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
2225	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
	/* Bail out if the free list could not be completely refilled. */
	if (min(128, fl->ifl_size) != fl->ifl_credits)
		return (ENOBUFS);
2231	MPASS(rxq != NULL);
2232	MPASS(fl->ifl_ifdi != NULL);
2233	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2234	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2235	return (0);
2236}
2237
2238/*********************************************************************
2239 *
2240 *  Free receive ring data structures
2241 *
2242 **********************************************************************/
2243static void
2244iflib_rx_sds_free(iflib_rxq_t rxq)
2245{
2246	iflib_fl_t fl;
2247	int i, j;
2248
2249	if (rxq->ifr_fl != NULL) {
2250		for (i = 0; i < rxq->ifr_nfl; i++) {
2251			fl = &rxq->ifr_fl[i];
2252			if (fl->ifl_buf_tag != NULL) {
2253				if (fl->ifl_sds.ifsd_map != NULL) {
2254					for (j = 0; j < fl->ifl_size; j++) {
2255						if (fl->ifl_sds.ifsd_map[j] ==
2256						    NULL)
2257							continue;
2258						bus_dmamap_sync(
2259						    fl->ifl_buf_tag,
2260						    fl->ifl_sds.ifsd_map[j],
2261						    BUS_DMASYNC_POSTREAD);
2262						bus_dmamap_unload(
2263						    fl->ifl_buf_tag,
2264						    fl->ifl_sds.ifsd_map[j]);
2265					}
2266				}
2267				bus_dma_tag_destroy(fl->ifl_buf_tag);
2268				fl->ifl_buf_tag = NULL;
2269			}
2270			free(fl->ifl_sds.ifsd_m, M_IFLIB);
2271			free(fl->ifl_sds.ifsd_cl, M_IFLIB);
2272			free(fl->ifl_sds.ifsd_ba, M_IFLIB);
2273			free(fl->ifl_sds.ifsd_map, M_IFLIB);
2274			fl->ifl_sds.ifsd_m = NULL;
2275			fl->ifl_sds.ifsd_cl = NULL;
2276			fl->ifl_sds.ifsd_ba = NULL;
2277			fl->ifl_sds.ifsd_map = NULL;
2278		}
2279		free(rxq->ifr_fl, M_IFLIB);
2280		rxq->ifr_fl = NULL;
2281		rxq->ifr_cq_cidx = 0;
2282	}
2283}
2284
2285/*
2286 * Timer routine
2287 */
2288static void
2289iflib_timer(void *arg)
2290{
2291	iflib_txq_t txq = arg;
2292	if_ctx_t ctx = txq->ift_ctx;
2293	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2294	uint64_t this_tick = ticks;
2295	uint32_t reset_on = hz / 2;
2296
2297	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
2298		return;
2299
2300	/*
	** Check on the state of the TX queue(s); this
	** can be done without the lock because it's RO
	** and the HUNG state will be static if set.
2304	*/
2305	if (this_tick - txq->ift_last_timer_tick >= hz / 2) {
2306		txq->ift_last_timer_tick = this_tick;
2307		IFDI_TIMER(ctx, txq->ift_id);
2308		if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
2309		    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
2310		     (sctx->isc_pause_frames == 0)))
2311			goto hung;
2312
2313		if (ifmp_ring_is_stalled(txq->ift_br))
2314			txq->ift_qstatus = IFLIB_QUEUE_HUNG;
2315		txq->ift_cleaned_prev = txq->ift_cleaned;
2316	}
2317#ifdef DEV_NETMAP
2318	if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP)
2319		iflib_netmap_timer_adjust(ctx, txq, &reset_on);
2320#endif
2321	/* handle any laggards */
2322	if (txq->ift_db_pending)
2323		GROUPTASK_ENQUEUE(&txq->ift_task);
2324
2325	sctx->isc_pause_frames = 0;
2326	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
2327		callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu);
2328	return;
2329
2330 hung:
2331	device_printf(ctx->ifc_dev,
2332	    "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
2333	    txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
2334	STATE_LOCK(ctx);
2335	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2336	ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
2337	iflib_admin_intr_deferred(ctx);
2338	STATE_UNLOCK(ctx);
2339}
2340
2341static void
2342iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
2343{
2344	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2345
2346	/*
2347	 * XXX don't set the max_frame_size to larger
2348	 * than the hardware can handle
2349	 */
2350	if (sctx->isc_max_frame_size <= MCLBYTES)
2351		ctx->ifc_rx_mbuf_sz = MCLBYTES;
2352	else
2353		ctx->ifc_rx_mbuf_sz = MJUMPAGESIZE;
2354}
2355
2356uint32_t
2357iflib_get_rx_mbuf_sz(if_ctx_t ctx)
2358{
2359
2360	return (ctx->ifc_rx_mbuf_sz);
2361}
2362
2363static void
2364iflib_init_locked(if_ctx_t ctx)
2365{
2366	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2367	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2368	if_t ifp = ctx->ifc_ifp;
2369	iflib_fl_t fl;
2370	iflib_txq_t txq;
2371	iflib_rxq_t rxq;
2372	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
2373
2374	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2375	IFDI_INTR_DISABLE(ctx);
2376
2377	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
2378	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
2379	/* Set hardware offload abilities */
2380	if_clearhwassist(ifp);
2381	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
2382		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
2383	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
2384		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
2385	if (if_getcapenable(ifp) & IFCAP_TSO4)
2386		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2387	if (if_getcapenable(ifp) & IFCAP_TSO6)
2388		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2389
2390	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
2391		CALLOUT_LOCK(txq);
2392		callout_stop(&txq->ift_timer);
2393		CALLOUT_UNLOCK(txq);
2394		iflib_netmap_txq_init(ctx, txq);
2395	}
2396
2397	/*
2398	 * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
2399	 * that drivers can use the value when setting up the hardware receive
2400	 * buffers.
2401	 */
2402	iflib_calc_rx_mbuf_sz(ctx);
2403
2404#ifdef INVARIANTS
2405	i = if_getdrvflags(ifp);
2406#endif
2407	IFDI_INIT(ctx);
2408	MPASS(if_getdrvflags(ifp) == i);
2409	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
2410		/* XXX this should really be done on a per-queue basis */
2411		if (if_getcapenable(ifp) & IFCAP_NETMAP) {
2412			MPASS(rxq->ifr_id == i);
2413			iflib_netmap_rxq_init(ctx, rxq);
2414			continue;
2415		}
2416		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
2417			if (iflib_fl_setup(fl)) {
2418				device_printf(ctx->ifc_dev,
2419				    "setting up free list %d failed - "
2420				    "check cluster settings\n", j);
2421				goto done;
2422			}
2423		}
2424	}
2425done:
2426	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
2427	IFDI_INTR_ENABLE(ctx);
2428	txq = ctx->ifc_txqs;
2429	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
2430		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq,
2431			txq->ift_timer.c_cpu);
2432}
2433
2434static int
2435iflib_media_change(if_t ifp)
2436{
2437	if_ctx_t ctx = if_getsoftc(ifp);
2438	int err;
2439
2440	CTX_LOCK(ctx);
2441	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
2442		iflib_init_locked(ctx);
2443	CTX_UNLOCK(ctx);
2444	return (err);
2445}
2446
2447static void
2448iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
2449{
2450	if_ctx_t ctx = if_getsoftc(ifp);
2451
2452	CTX_LOCK(ctx);
2453	IFDI_UPDATE_ADMIN_STATUS(ctx);
2454	IFDI_MEDIA_STATUS(ctx, ifmr);
2455	CTX_UNLOCK(ctx);
2456}
2457
2458void
2459iflib_stop(if_ctx_t ctx)
2460{
2461	iflib_txq_t txq = ctx->ifc_txqs;
2462	iflib_rxq_t rxq = ctx->ifc_rxqs;
2463	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2464	if_shared_ctx_t sctx = ctx->ifc_sctx;
2465	iflib_dma_info_t di;
2466	iflib_fl_t fl;
2467	int i, j;
2468
2469	/* Tell the stack that the interface is no longer active */
2470	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2471
2472	IFDI_INTR_DISABLE(ctx);
2473	DELAY(1000);
2474	IFDI_STOP(ctx);
2475	DELAY(1000);
2476
2477	iflib_debug_reset();
2478	/* Wait for current tx queue users to exit to disarm watchdog timer. */
2479	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
2480		/* make sure all transmitters have completed before proceeding XXX */
2481
2482		CALLOUT_LOCK(txq);
2483		callout_stop(&txq->ift_timer);
2484		CALLOUT_UNLOCK(txq);
2485
2486		/* clean any enqueued buffers */
2487		iflib_ifmp_purge(txq);
2488		/* Free any existing tx buffers. */
2489		for (j = 0; j < txq->ift_size; j++) {
2490			iflib_txsd_free(ctx, txq, j);
2491		}
2492		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
2493		txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
2494		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
2495		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
2496		txq->ift_pullups = 0;
2497		ifmp_ring_reset_stats(txq->ift_br);
2498		for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
2499			bzero((void *)di->idi_vaddr, di->idi_size);
2500	}
2501	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
2502		/* make sure all transmitters have completed before proceeding XXX */
2503
2504		rxq->ifr_cq_cidx = 0;
2505		for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
2506			bzero((void *)di->idi_vaddr, di->idi_size);
2507		/* also resets the free lists pidx/cidx */
2508		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
2509			iflib_fl_bufs_free(fl);
2510	}
2511}
2512
2513static inline caddr_t
2514calc_next_rxd(iflib_fl_t fl, int cidx)
2515{
2516	qidx_t size;
2517	int nrxd;
2518	caddr_t start, end, cur, next;
2519
2520	nrxd = fl->ifl_size;
2521	size = fl->ifl_rxd_size;
2522	start = fl->ifl_ifdi->idi_vaddr;
2523
2524	if (__predict_false(size == 0))
2525		return (start);
2526	cur = start + size*cidx;
2527	end = start + size*nrxd;
2528	next = CACHE_PTR_NEXT(cur);
2529	return (next < end ? next : start);
2530}
2531
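/*
 * Warm the caches for the software descriptor entries (mbuf and cluster
 * pointers) and the next hardware descriptor cache line that the RX path
 * is about to consume.
 */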
2532static inline void
2533prefetch_pkts(iflib_fl_t fl, int cidx)
2534{
2535	int nextptr;
2536	int nrxd = fl->ifl_size;
2537	caddr_t next_rxd;
2538
2539
2540	nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
2541	prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
2542	prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
2543	next_rxd = calc_next_rxd(fl, cidx);
2544	prefetch(next_rxd);
2545	prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
2546	prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
2547	prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
2548	prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
2549	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
2550	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
2551	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
2552	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
2553}
2554
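/*
 * Translate one RX fragment reported by the driver into its software
 * descriptor state: look up the free list entry, sync (and optionally
 * unload) its DMA map, run any pfil hooks on the raw buffer, advance the
 * free list consumer index, and return the associated mbuf (NULL if a
 * filter consumed or dropped the packet).
 */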
2555static struct mbuf *
2556rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
2557    int *pf_rv, if_rxd_info_t ri)
2558{
2559	bus_dmamap_t map;
2560	iflib_fl_t fl;
2561	caddr_t payload;
2562	struct mbuf *m;
2563	int flid, cidx, len, next;
2564
2565	map = NULL;
2566	flid = irf->irf_flid;
2567	cidx = irf->irf_idx;
2568	fl = &rxq->ifr_fl[flid];
2569	sd->ifsd_fl = fl;
2570	sd->ifsd_cidx = cidx;
2571	m = fl->ifl_sds.ifsd_m[cidx];
2572	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
2573	fl->ifl_credits--;
2574#if MEMORY_LOGGING
2575	fl->ifl_m_dequeued++;
2576#endif
2577	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
2578		prefetch_pkts(fl, cidx);
2579	next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
2580	prefetch(&fl->ifl_sds.ifsd_map[next]);
2581	map = fl->ifl_sds.ifsd_map[cidx];
2582	next = (cidx + CACHE_LINE_SIZE) & (fl->ifl_size-1);
2583
	/* not a valid assert if bxe really does SGE from non-contiguous elements */
2585	MPASS(fl->ifl_cidx == cidx);
2586	bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
2587
2588#ifndef __HAIKU__
2589	if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL) {
2590		payload  = *sd->ifsd_cl;
2591		payload +=  ri->iri_pad;
2592		len = ri->iri_len - ri->iri_pad;
2593		*pf_rv = pfil_run_hooks(rxq->pfil, payload, ri->iri_ifp,
2594		    len | PFIL_MEMPTR | PFIL_IN, NULL);
2595		switch (*pf_rv) {
2596		case PFIL_DROPPED:
2597		case PFIL_CONSUMED:
2598			/*
2599			 * The filter ate it.  Everything is recycled.
2600			 */
2601			m = NULL;
2602			unload = 0;
2603			break;
2604		case PFIL_REALLOCED:
2605			/*
2606			 * The filter copied it.  Everything is recycled.
2607			 */
2608			m = pfil_mem2mbuf(payload);
2609			unload = 0;
2610			break;
2611		case PFIL_PASS:
2612			/*
2613			 * Filter said it was OK, so receive like
2614			 * normal
2615			 */
2616			fl->ifl_sds.ifsd_m[cidx] = NULL;
2617			break;
2618		default:
2619			MPASS(0);
2620		}
2621	} else
2622#endif
2623	{
2624		fl->ifl_sds.ifsd_m[cidx] = NULL;
2625		*pf_rv = PFIL_PASS;
2626	}
2627
2628	if (unload)
2629		bus_dmamap_unload(fl->ifl_buf_tag, map);
2630	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
2631	if (__predict_false(fl->ifl_cidx == 0))
2632		fl->ifl_gen = 0;
2633	bit_clear(fl->ifl_rx_bitmap, cidx);
2634	return (m);
2635}
2636
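/*
 * Stitch the fragments described in @ri into a single mbuf chain,
 * attaching each cluster to a freshly initialized mbuf.  Any pad offset is
 * applied to the first fragment only; zero-length fragments and packets
 * consumed by a packet filter are recycled rather than chained.
 */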
2637static struct mbuf *
2638assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv)
2639{
2640	struct mbuf *m, *mh, *mt;
2641	caddr_t cl;
2642	int  *pf_rv_ptr, flags, i, padlen;
2643	bool consumed;
2644
2645	i = 0;
2646	mh = NULL;
2647	consumed = false;
2648	*pf_rv = PFIL_PASS;
2649	pf_rv_ptr = pf_rv;
2650	do {
2651		m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd,
2652		    pf_rv_ptr, ri);
2653
2654		MPASS(*sd->ifsd_cl != NULL);
2655
2656		/*
2657		 * Exclude zero-length frags & frags from
2658		 * packets the filter has consumed or dropped
2659		 */
2660		if (ri->iri_frags[i].irf_len == 0 || consumed ||
2661#ifndef __HAIKU__
2662		    *pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED
2663#else
2664			0
2665#endif
2666		     ) {
2667			if (mh == NULL) {
2668				/* everything saved here */
2669				consumed = true;
2670				pf_rv_ptr = NULL;
2671				continue;
2672			}
2673			/* XXX we can save the cluster here, but not the mbuf */
2674			m_init(m, M_NOWAIT, MT_DATA, 0);
2675			m_free(m);
2676			continue;
2677		}
2678		if (mh == NULL) {
2679			flags = M_PKTHDR|M_EXT;
2680			mh = mt = m;
2681			padlen = ri->iri_pad;
2682		} else {
2683			flags = M_EXT;
2684			mt->m_next = m;
2685			mt = m;
2686			/* assuming padding is only on the first fragment */
2687			padlen = 0;
2688		}
2689		cl = *sd->ifsd_cl;
2690		*sd->ifsd_cl = NULL;
2691
2692		/* Can these two be made one ? */
2693		m_init(m, M_NOWAIT, MT_DATA, flags);
2694		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
2695		/*
2696		 * These must follow m_init and m_cljset
2697		 */
2698		m->m_data += padlen;
2699		ri->iri_len -= padlen;
2700		m->m_len = ri->iri_frags[i].irf_len;
2701	} while (++i < ri->iri_nfrags);
2702
2703	return (mh);
2704}
2705
2706/*
2707 * Process one software descriptor
2708 */
2709static struct mbuf *
2710iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
2711{
2712	struct if_rxsd sd;
2713	struct mbuf *m;
2714	int pf_rv;
2715
2716	/* should I merge this back in now that the two paths are basically duplicated? */
2717	if (ri->iri_nfrags == 1 &&
2718	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
2719		m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd,
2720		    &pf_rv, ri);
2721		if (pf_rv != PFIL_PASS
2722#ifndef __HAIKU__
2723		        && pf_rv != PFIL_REALLOCED
2724#endif
2725		        )
2726			return (m);
2727		if (pf_rv == PFIL_PASS) {
2728			m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
2729#ifndef __NO_STRICT_ALIGNMENT
2730			if (!IP_ALIGNED(m))
2731				m->m_data += 2;
2732#endif
2733			memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
2734			m->m_len = ri->iri_frags[0].irf_len;
2735		}
2736	} else {
2737		m = assemble_segments(rxq, ri, &sd, &pf_rv);
2738		if (pf_rv != PFIL_PASS
2739#ifndef __HAIKU__
2740		        && pf_rv != PFIL_REALLOCED
2741#endif
2742				)
2743			return (m);
2744	}
2745	m->m_pkthdr.len = ri->iri_len;
2746	m->m_pkthdr.rcvif = ri->iri_ifp;
2747	m->m_flags |= ri->iri_flags;
2748	m->m_pkthdr.ether_vtag = ri->iri_vtag;
2749	m->m_pkthdr.flowid = ri->iri_flowid;
2750	M_HASHTYPE_SET(m, ri->iri_rsstype);
2751	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
2752	m->m_pkthdr.csum_data = ri->iri_csum_data;
2753	return (m);
2754}
2755
2756#if defined(INET6) || defined(INET)
2757static void
2758iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
2759{
2760	CURVNET_SET(lc->ifp->if_vnet);
2761#if defined(INET6)
2762	*v6 = VNET(ip6_forwarding);
2763#endif
2764#if defined(INET)
2765	*v4 = VNET(ipforwarding);
2766#endif
2767	CURVNET_RESTORE();
2768}
2769
2770/*
2771 * Returns true if it's possible this packet could be LROed.
 * If it returns false, it is guaranteed that tcp_lro_rx()
2773 * would not return zero.
2774 */
2775static bool
2776iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
2777{
2778#ifndef __HAIKU__
2779	struct ether_header *eh;
2780
2781	eh = mtod(m, struct ether_header *);
2782	switch (eh->ether_type) {
2783#if defined(INET6)
2784		case htons(ETHERTYPE_IPV6):
2785			return (!v6_forwarding);
2786#endif
2787#if defined (INET)
2788		case htons(ETHERTYPE_IP):
2789			return (!v4_forwarding);
2790#endif
2791	}
2792#endif
2793
2794	return false;
2795}
2796#else
2797static void
2798iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
2799{
2800}
2801#endif
2802
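/*
 * Service an RX queue: pull up to @budget completed packets from the
 * driver via isc_rxd_pkt_get(), replenish the free lists, and hand the
 * resulting mbuf chains to LRO (when enabled and applicable) or directly
 * to if_input().  Returns true if more work remains.
 */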
2803static bool
2804iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
2805{
2806	if_t ifp;
2807	if_ctx_t ctx = rxq->ifr_ctx;
2808	if_shared_ctx_t sctx = ctx->ifc_sctx;
2809	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2810	int avail, i;
2811	qidx_t *cidxp;
2812	struct if_rxd_info ri;
2813	int err, budget_left, rx_bytes, rx_pkts;
2814	iflib_fl_t fl;
2815	int lro_enabled;
2816	bool v4_forwarding, v6_forwarding, lro_possible;
2817
2818	/*
2819	 * XXX early demux data packets so that if_input processing only handles
2820	 * acks in interrupt context
2821	 */
2822	struct mbuf *m, *mh, *mt, *mf;
2823
2824	lro_possible = v4_forwarding = v6_forwarding = false;
2825	ifp = ctx->ifc_ifp;
2826	mh = mt = NULL;
2827	MPASS(budget > 0);
2828	rx_pkts	= rx_bytes = 0;
2829	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
2830		cidxp = &rxq->ifr_cq_cidx;
2831	else
2832		cidxp = &rxq->ifr_fl[0].ifl_cidx;
2833	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
2834		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2835			__iflib_fl_refill_lt(ctx, fl, budget + 8);
2836		DBG_COUNTER_INC(rx_unavail);
2837		return (false);
2838	}
2839
2840	/* pfil needs the vnet to be set */
2841	CURVNET_SET_QUIET(ifp->if_vnet);
2842	for (budget_left = budget; budget_left > 0 && avail > 0;) {
2843		if (__predict_false(!CTX_ACTIVE(ctx))) {
2844			DBG_COUNTER_INC(rx_ctx_inactive);
2845			break;
2846		}
2847		/*
2848		 * Reset client set fields to their default values
2849		 */
2850		rxd_info_zero(&ri);
2851		ri.iri_qsidx = rxq->ifr_id;
2852		ri.iri_cidx = *cidxp;
2853		ri.iri_ifp = ifp;
2854		ri.iri_frags = rxq->ifr_frags;
2855		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
2856
2857		if (err)
2858			goto err;
2859		rx_pkts += 1;
2860		rx_bytes += ri.iri_len;
2861		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
2862			*cidxp = ri.iri_cidx;
2863			/* Update our consumer index */
2864			/* XXX NB: shurd - check if this is still safe */
2865			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
2866				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
2867			/* was this only a completion queue message? */
2868			if (__predict_false(ri.iri_nfrags == 0))
2869				continue;
2870		}
2871		MPASS(ri.iri_nfrags != 0);
2872		MPASS(ri.iri_len != 0);
2873
2874		/* will advance the cidx on the corresponding free lists */
2875		m = iflib_rxd_pkt_get(rxq, &ri);
2876		avail--;
2877		budget_left--;
2878		if (avail == 0 && budget_left)
2879			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
2880
2881		if (__predict_false(m == NULL))
2882			continue;
2883
2884		/* imm_pkt: -- cxgb */
2885		if (mh == NULL)
2886			mh = mt = m;
2887		else {
2888			mt->m_nextpkt = m;
2889			mt = m;
2890		}
2891	}
2892	CURVNET_RESTORE();
2893	/* make sure that we can refill faster than drain */
2894	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2895		__iflib_fl_refill_lt(ctx, fl, budget + 8);
2896
2897	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
2898#ifndef __HAIKU__
2899	if (lro_enabled)
2900		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
2901#endif
2902	mt = mf = NULL;
2903	while (mh != NULL) {
2904		m = mh;
2905		mh = mh->m_nextpkt;
2906		m->m_nextpkt = NULL;
2907#ifndef __NO_STRICT_ALIGNMENT
2908		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
2909			continue;
2910#endif
2911		rx_bytes += m->m_pkthdr.len;
2912		rx_pkts++;
2913#ifndef __HAIKU__
2914#if defined(INET6) || defined(INET)
2915		if (lro_enabled) {
2916			if (!lro_possible) {
2917				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
2918				if (lro_possible && mf != NULL) {
2919					ifp->if_input(ifp, mf);
2920					DBG_COUNTER_INC(rx_if_input);
2921					mt = mf = NULL;
2922				}
2923			}
2924			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
2925			    (CSUM_L4_CALC|CSUM_L4_VALID)) {
2926				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
2927					continue;
2928			}
2929		}
2930#endif
2931		if (lro_possible) {
2932			ifp->if_input(ifp, m);
2933			DBG_COUNTER_INC(rx_if_input);
2934			continue;
2935		}
#else /* __HAIKU__ */
2937		if (mf != NULL) {
2938			ifp->if_input(ifp, mf);
2939			DBG_COUNTER_INC(rx_if_input);
2940			mt = mf = NULL;
2941		}
2942		ifp->if_input(ifp, m);
2943		DBG_COUNTER_INC(rx_if_input);
2944		continue;
2945#endif
2946
2947		if (mf == NULL)
2948			mf = m;
2949		if (mt != NULL)
2950			mt->m_nextpkt = m;
2951		mt = m;
2952	}
2953	if (mf != NULL) {
2954		ifp->if_input(ifp, mf);
2955		DBG_COUNTER_INC(rx_if_input);
2956	}
2957
2958	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
2959	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
2960
2961	/*
2962	 * Flush any outstanding LRO work
2963	 */
2964#if defined(INET6) || defined(INET)
2965#ifndef __HAIKU__
2966	tcp_lro_flush_all(&rxq->ifr_lc);
2967#endif
2968#endif
2969	if (avail)
2970		return true;
2971	return (iflib_rxd_avail(ctx, rxq, *cidxp, 1));
2972err:
2973	STATE_LOCK(ctx);
2974	ctx->ifc_flags |= IFC_DO_RESET;
2975	iflib_admin_intr_deferred(ctx);
2976	STATE_UNLOCK(ctx);
2977	return (false);
2978}
2979
2980#define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
2981static inline qidx_t
2982txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
2983{
2984	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
2985	qidx_t minthresh = txq->ift_size / 8;
2986	if (in_use > 4*minthresh)
2987		return (notify_count);
2988	if (in_use > 2*minthresh)
2989		return (notify_count >> 1);
2990	if (in_use > minthresh)
2991		return (notify_count >> 3);
2992	return (0);
2993}
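/*
 * Example with illustrative values only: for a 1024-descriptor ring and an
 * update frequency of 16, TXD_NOTIFY_COUNT() is 63 and minthresh is 128.
 * A queue more than half full may defer up to 63 doorbell updates, more
 * than a quarter full up to 31, more than an eighth full up to 7, and a
 * nearly idle queue rings the doorbell immediately.
 */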
2994
2995static inline qidx_t
2996txq_max_rs_deferred(iflib_txq_t txq)
2997{
2998	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
2999	qidx_t minthresh = txq->ift_size / 8;
3000	if (txq->ift_in_use > 4*minthresh)
3001		return (notify_count);
3002	if (txq->ift_in_use > 2*minthresh)
3003		return (notify_count >> 1);
3004	if (txq->ift_in_use > minthresh)
3005		return (notify_count >> 2);
3006	return (2);
3007}
3008
3009#define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
#define M_HAS_VLANTAG(m) ((m)->m_flags & M_VLANTAG)
3011
3012#define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
3013#define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
3014#define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
3015
3016/* forward compatibility for cxgb */
3017#define FIRST_QSET(ctx) 0
3018#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
3019#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
3020#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
3021#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
3022
3023/* XXX we should be setting this to something other than zero */
3024#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
3025#define	MAX_TX_DESC(ctx) max((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
3026    (ctx)->ifc_softc_ctx.isc_tx_nsegments)
3027
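/*
 * Ring the TX doorbell if the caller requests it or enough descriptors
 * have accumulated to exceed the deferral threshold; returns whether the
 * doorbell was actually rung.
 */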
3028static inline bool
3029iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use)
3030{
3031	qidx_t dbval, max;
3032	bool rang;
3033
3034	rang = false;
3035	max = TXQ_MAX_DB_DEFERRED(txq, in_use);
3036	if (ring || txq->ift_db_pending >= max) {
3037		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
3038		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3039		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
3040		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
3041		txq->ift_db_pending = txq->ift_npending = 0;
3042		rang = true;
3043	}
3044	return (rang);
3045}
3046
3047#ifdef PKT_DEBUG
3048static void
3049print_pkt(if_pkt_info_t pi)
3050{
3051	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
3052	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
3053	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
3054	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
3055	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
3056	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
3057}
3058#endif
3059
3060#define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
3061#define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
3062#define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
3063#define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
3064
3065static int
3066iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
3067{
3068	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
3069	struct ether_vlan_header *eh;
3070	struct mbuf *m;
3071
3072	m = *mp;
3073	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
3074	    M_WRITABLE(m) == 0) {
3075		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
3076			return (ENOMEM);
3077		} else {
3078			m_freem(*mp);
3079			DBG_COUNTER_INC(tx_frees);
3080			*mp = m;
3081		}
3082	}
3083
3084	/*
3085	 * Determine where frame payload starts.
3086	 * Jump over vlan headers if already present,
3087	 * helpful for QinQ too.
3088	 */
3089	if (__predict_false(m->m_len < sizeof(*eh))) {
3090		txq->ift_pullups++;
3091		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
3092			return (ENOMEM);
3093	}
3094	eh = mtod(m, struct ether_vlan_header *);
3095	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
3096		pi->ipi_etype = ntohs(eh->evl_proto);
3097		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3098	} else {
3099		pi->ipi_etype = ntohs(eh->evl_encap_proto);
3100		pi->ipi_ehdrlen = ETHER_HDR_LEN;
3101	}
3102
3103	switch (pi->ipi_etype) {
3104#ifdef INET
3105	case ETHERTYPE_IP:
3106	{
3107		struct mbuf *n;
3108		struct ip *ip = NULL;
3109		struct tcphdr *th = NULL;
3110		int minthlen;
3111
3112		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
3113		if (__predict_false(m->m_len < minthlen)) {
3114			/*
			 * If this code bloat is causing too much of a hit,
			 * move it to a separate function and mark it noinline.
3117			 */
3118			if (m->m_len == pi->ipi_ehdrlen) {
3119				n = m->m_next;
3120				MPASS(n);
3121				if (n->m_len >= sizeof(*ip))  {
3122					ip = (struct ip *)n->m_data;
3123					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3124						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3125				} else {
3126					txq->ift_pullups++;
3127					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3128						return (ENOMEM);
3129					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3130				}
3131			} else {
3132				txq->ift_pullups++;
3133				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3134					return (ENOMEM);
3135				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3136				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3137					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3138			}
3139		} else {
3140			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3141			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3142				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3143		}
3144		pi->ipi_ip_hlen = ip->ip_hl << 2;
3145		pi->ipi_ipproto = ip->ip_p;
3146		pi->ipi_flags |= IPI_TX_IPV4;
3147
3148		/* TCP checksum offload may require TCP header length */
3149		if (IS_TX_OFFLOAD4(pi)) {
3150			if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
3151				if (__predict_false(th == NULL)) {
3152					txq->ift_pullups++;
3153					if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
3154						return (ENOMEM);
3155					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
3156				}
3157				pi->ipi_tcp_hflags = th->th_flags;
3158				pi->ipi_tcp_hlen = th->th_off << 2;
3159				pi->ipi_tcp_seq = th->th_seq;
3160			}
3161			if (IS_TSO4(pi)) {
3162				if (__predict_false(ip->ip_p != IPPROTO_TCP))
3163					return (ENXIO);
3164				/*
3165				 * TSO always requires hardware checksum offload.
3166				 */
3167				pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
3168				th->th_sum = in_pseudo(ip->ip_src.s_addr,
3169						       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
3170				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3171				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
3172					ip->ip_sum = 0;
3173					ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
3174				}
3175			}
3176		}
3177		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
3178                       ip->ip_sum = 0;
3179
3180		break;
3181	}
3182#endif
3183#ifdef INET6
3184	case ETHERTYPE_IPV6:
3185	{
3186		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
3187		struct tcphdr *th;
3188		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
3189
3190		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
3191			txq->ift_pullups++;
3192			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
3193				return (ENOMEM);
3194		}
3195		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
3196
3197		/* XXX-BZ this will go badly in case of ext hdrs. */
3198		pi->ipi_ipproto = ip6->ip6_nxt;
3199		pi->ipi_flags |= IPI_TX_IPV6;
3200
3201		/* TCP checksum offload may require TCP header length */
3202		if (IS_TX_OFFLOAD6(pi)) {
3203			if (pi->ipi_ipproto == IPPROTO_TCP) {
3204				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
3205					txq->ift_pullups++;
3206					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
3207						return (ENOMEM);
3208				}
3209				pi->ipi_tcp_hflags = th->th_flags;
3210				pi->ipi_tcp_hlen = th->th_off << 2;
3211				pi->ipi_tcp_seq = th->th_seq;
3212			}
3213			if (IS_TSO6(pi)) {
3214				if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
3215					return (ENXIO);
3216				/*
3217				 * TSO always requires hardware checksum offload.
3218				 */
3219				pi->ipi_csum_flags |= CSUM_IP6_TCP;
3220				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
3221				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3222			}
3223		}
3224		break;
3225	}
3226#endif
3227	default:
3228		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
3229		pi->ipi_ip_hlen = 0;
3230		break;
3231	}
3232	*mp = m;
3233
3234	return (0);
3235}
3236
3237/*
3238 * If dodgy hardware rejects the scatter gather chain we've handed it
3239 * we'll need to remove the mbuf chain from ifsg_m[] before we can add the
3240 * m_defrag'd mbufs
3241 */
3242static __noinline struct mbuf *
3243iflib_remove_mbuf(iflib_txq_t txq)
3244{
3245	int ntxd, pidx;
3246	struct mbuf *m, **ifsd_m;
3247
3248	ifsd_m = txq->ift_sds.ifsd_m;
3249	ntxd = txq->ift_size;
3250	pidx = txq->ift_pidx & (ntxd - 1);
3251	ifsd_m = txq->ift_sds.ifsd_m;
3252	m = ifsd_m[pidx];
3253	ifsd_m[pidx] = NULL;
3254	bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]);
3255	if (txq->ift_sds.ifsd_tso_map != NULL)
3256		bus_dmamap_unload(txq->ift_tso_buf_tag,
3257		    txq->ift_sds.ifsd_tso_map[pidx]);
3258#if MEMORY_LOGGING
3259	txq->ift_dequeued++;
3260#endif
3261	return (m);
3262}
3263
3264static inline caddr_t
3265calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
3266{
3267	qidx_t size;
3268	int ntxd;
3269	caddr_t start, end, cur, next;
3270
3271	ntxd = txq->ift_size;
3272	size = txq->ift_txd_size[qid];
3273	start = txq->ift_ifdi[qid].idi_vaddr;
3274
3275	if (__predict_false(size == 0))
3276		return (start);
3277	cur = start + size*cidx;
3278	end = start + size*ntxd;
3279	next = CACHE_PTR_NEXT(cur);
3280	return (next < end ? next : start);
3281}
3282
3283/*
3284 * Pad an mbuf to ensure a minimum ethernet frame size.
3285 * min_frame_size is the frame size (less CRC) to pad the mbuf to
3286 */
3287static __noinline int
3288iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
3289{
3290	/*
3291	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
	 * an ARP message is the smallest common payload I can think of.
3293	 */
3294	static char pad[18];	/* just zeros */
3295	int n;
3296	struct mbuf *new_head;
3297
3298	if (!M_WRITABLE(*m_head)) {
3299		new_head = m_dup(*m_head, M_NOWAIT);
3300		if (new_head == NULL) {
3301			m_freem(*m_head);
			device_printf(dev,
			    "cannot pad short frame, m_dup() failed\n");
			DBG_COUNTER_INC(encap_pad_mbuf_fail);
			DBG_COUNTER_INC(tx_frees);
			return (ENOMEM);
3306		}
3307		m_freem(*m_head);
3308		*m_head = new_head;
3309	}
3310
3311	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
3312	     n > 0; n -= sizeof(pad))
3313		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
3314			break;
3315
3316	if (n > 0) {
3317		m_freem(*m_head);
3318		device_printf(dev, "cannot pad short frame\n");
3319		DBG_COUNTER_INC(encap_pad_mbuf_fail);
3320		DBG_COUNTER_INC(tx_frees);
3321		return (ENOBUFS);
3322	}
3323
	return (0);
3325}
3326
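/*
 * Encapsulate one mbuf chain into TX descriptors: parse checksum/TSO/VLAN
 * metadata into an if_pkt_info, DMA-map the chain (collapsing or
 * defragmenting it on EFBIG), hand it to the driver's isc_txd_encap(), and
 * account for the descriptors consumed, including any sentinels.
 */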
3327static int
3328iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
3329{
3330	if_ctx_t		ctx;
3331	if_shared_ctx_t		sctx;
3332	if_softc_ctx_t		scctx;
3333	bus_dma_tag_t		buf_tag;
3334	bus_dma_segment_t	*segs;
3335	struct mbuf		*m_head, **ifsd_m;
3336	void			*next_txd;
3337	bus_dmamap_t		map;
3338	struct if_pkt_info	pi;
3339	int remap = 0;
3340	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
3341
3342	ctx = txq->ift_ctx;
3343	sctx = ctx->ifc_sctx;
3344	scctx = &ctx->ifc_softc_ctx;
3345	segs = txq->ift_segs;
3346	ntxd = txq->ift_size;
3347	m_head = *m_headp;
3348	map = NULL;
3349
3350	/*
3351	 * If we're doing TSO the next descriptor to clean may be quite far ahead
3352	 */
3353	cidx = txq->ift_cidx;
3354	pidx = txq->ift_pidx;
3355	if (ctx->ifc_flags & IFC_PREFETCH) {
3356		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
3357		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
3358			next_txd = calc_next_txd(txq, cidx, 0);
3359			prefetch(next_txd);
3360		}
3361
3362		/* prefetch the next cache line of mbuf pointers and flags */
3363		prefetch(&txq->ift_sds.ifsd_m[next]);
3364		prefetch(&txq->ift_sds.ifsd_map[next]);
3365		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
3366	}
3367	map = txq->ift_sds.ifsd_map[pidx];
3368	ifsd_m = txq->ift_sds.ifsd_m;
3369
3370	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3371		buf_tag = txq->ift_tso_buf_tag;
3372		max_segs = scctx->isc_tx_tso_segments_max;
3373		map = txq->ift_sds.ifsd_tso_map[pidx];
3374		MPASS(buf_tag != NULL);
3375		MPASS(max_segs > 0);
3376	} else {
3377		buf_tag = txq->ift_buf_tag;
3378		max_segs = scctx->isc_tx_nsegments;
3379		map = txq->ift_sds.ifsd_map[pidx];
3380	}
3381	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
3382	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
3383		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
3384		if (err) {
3385			DBG_COUNTER_INC(encap_txd_encap_fail);
3386			return err;
3387		}
3388	}
3389	m_head = *m_headp;
3390
3391	pkt_info_zero(&pi);
3392	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
3393	pi.ipi_pidx = pidx;
3394	pi.ipi_qsidx = txq->ift_id;
3395	pi.ipi_len = m_head->m_pkthdr.len;
3396	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
3397	pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
3398
3399	/* deliberate bitwise OR to make one condition */
3400	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
3401		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
3402			DBG_COUNTER_INC(encap_txd_encap_fail);
3403			return (err);
3404		}
3405		m_head = *m_headp;
3406	}
3407
3408retry:
3409	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
3410	    BUS_DMA_NOWAIT);
3411defrag:
3412	if (__predict_false(err)) {
3413		switch (err) {
3414		case EFBIG:
3415			/* try collapse once and defrag once */
3416			if (remap == 0) {
3417				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
3418				/* try defrag if collapsing fails */
3419				if (m_head == NULL)
3420					remap++;
3421			}
3422			if (remap == 1) {
3423				txq->ift_mbuf_defrag++;
3424				m_head = m_defrag(*m_headp, M_NOWAIT);
3425			}
3426			/*
3427			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
3428			 * failed to map an mbuf that was run through m_defrag
3429			 */
3430			MPASS(remap <= 1);
3431			if (__predict_false(m_head == NULL || remap > 1))
3432				goto defrag_failed;
3433			remap++;
3434			*m_headp = m_head;
3435			goto retry;
3436			break;
3437		case ENOMEM:
3438			txq->ift_no_tx_dma_setup++;
3439			break;
3440		default:
3441			txq->ift_no_tx_dma_setup++;
3442			m_freem(*m_headp);
3443			DBG_COUNTER_INC(tx_frees);
3444			*m_headp = NULL;
3445			break;
3446		}
3447		txq->ift_map_failed++;
3448		DBG_COUNTER_INC(encap_load_mbuf_fail);
3449		DBG_COUNTER_INC(encap_txd_encap_fail);
3450		return (err);
3451	}
3452	ifsd_m[pidx] = m_head;
3453	/*
3454	 * XXX assumes a 1 to 1 relationship between segments and
3455	 *        descriptors - this does not hold true on all drivers, e.g.
3456	 *        cxgb
3457	 */
3458	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
3459		txq->ift_no_desc_avail++;
3460		bus_dmamap_unload(buf_tag, map);
3461		DBG_COUNTER_INC(encap_txq_avail_fail);
3462		DBG_COUNTER_INC(encap_txd_encap_fail);
3463		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
3464			GROUPTASK_ENQUEUE(&txq->ift_task);
3465		return (ENOBUFS);
3466	}
3467	/*
3468	 * On Intel cards we can greatly reduce the number of TX interrupts
3469	 * we see by only setting report status on every Nth descriptor.
3470	 * However, this also means that the driver will need to keep track
3471	 * of the descriptors that RS was set on to check them for the DD bit.
3472	 */
3473	txq->ift_rs_pending += nsegs + 1;
3474	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
3475	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
3476		pi.ipi_flags |= IPI_TX_INTR;
3477		txq->ift_rs_pending = 0;
3478	}
3479
3480	pi.ipi_segs = segs;
3481	pi.ipi_nsegs = nsegs;
3482
3483	MPASS(pidx >= 0 && pidx < txq->ift_size);
3484#ifdef PKT_DEBUG
3485	print_pkt(&pi);
3486#endif
3487	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
3488		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
3489		DBG_COUNTER_INC(tx_encap);
3490		MPASS(pi.ipi_new_pidx < txq->ift_size);
3491
3492		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
3493		if (pi.ipi_new_pidx < pi.ipi_pidx) {
3494			ndesc += txq->ift_size;
3495			txq->ift_gen = 1;
3496		}
3497		/*
3498		 * drivers can need as many as
3499		 * two sentinels
3500		 */
3501		MPASS(ndesc <= pi.ipi_nsegs + 2);
3502		MPASS(pi.ipi_new_pidx != pidx);
3503		MPASS(ndesc > 0);
3504		txq->ift_in_use += ndesc;
3505
3506		/*
3507		 * We update the last software descriptor again here because there may
3508		 * be a sentinel and/or there may be more mbufs than segments
3509		 */
3510		txq->ift_pidx = pi.ipi_new_pidx;
3511		txq->ift_npending += pi.ipi_ndescs;
3512	} else {
3513		*m_headp = m_head = iflib_remove_mbuf(txq);
3514		if (err == EFBIG) {
3515			txq->ift_txd_encap_efbig++;
3516			if (remap < 2) {
3517				remap = 1;
3518				goto defrag;
3519			}
3520		}
3521		goto defrag_failed;
3522	}
3523	/*
	 * err can't possibly be non-zero here, so we don't need to test it
3525	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
3526	 */
3527	return (err);
3528
3529defrag_failed:
3530	txq->ift_mbuf_defrag_failed++;
3531	txq->ift_map_failed++;
3532	m_freem(*m_headp);
3533	DBG_COUNTER_INC(tx_frees);
3534	*m_headp = NULL;
3535	DBG_COUNTER_INC(encap_txd_encap_fail);
3536	return (ENOMEM);
3537}
3538
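/*
 * Release the resources held by @n completed TX descriptors starting at
 * the consumer index: sync and unload each buffer's DMA map (TSO or
 * regular) and free the attached mbuf chain.
 */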
3539static void
3540iflib_tx_desc_free(iflib_txq_t txq, int n)
3541{
3542	uint32_t qsize, cidx, mask, gen;
3543	struct mbuf *m, **ifsd_m;
3544	bool do_prefetch;
3545
3546	cidx = txq->ift_cidx;
3547	gen = txq->ift_gen;
3548	qsize = txq->ift_size;
3549	mask = qsize-1;
3550	ifsd_m = txq->ift_sds.ifsd_m;
3551	do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
3552
3553	while (n-- > 0) {
3554		if (do_prefetch) {
3555			prefetch(ifsd_m[(cidx + 3) & mask]);
3556			prefetch(ifsd_m[(cidx + 4) & mask]);
3557		}
3558		if ((m = ifsd_m[cidx]) != NULL) {
3559			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
3560			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
3561				bus_dmamap_sync(txq->ift_tso_buf_tag,
3562				    txq->ift_sds.ifsd_tso_map[cidx],
3563				    BUS_DMASYNC_POSTWRITE);
3564				bus_dmamap_unload(txq->ift_tso_buf_tag,
3565				    txq->ift_sds.ifsd_tso_map[cidx]);
3566			} else {
3567				bus_dmamap_sync(txq->ift_buf_tag,
3568				    txq->ift_sds.ifsd_map[cidx],
3569				    BUS_DMASYNC_POSTWRITE);
3570				bus_dmamap_unload(txq->ift_buf_tag,
3571				    txq->ift_sds.ifsd_map[cidx]);
3572			}
3573			/* XXX we don't support any drivers that batch packets yet */
3574			MPASS(m->m_nextpkt == NULL);
3575			m_freem(m);
3576			ifsd_m[cidx] = NULL;
3577#if MEMORY_LOGGING
3578			txq->ift_dequeued++;
3579#endif
3580			DBG_COUNTER_INC(tx_frees);
3581		}
3582		if (__predict_false(++cidx == qsize)) {
3583			cidx = 0;
3584			gen = 0;
3585		}
3586	}
3587	txq->ift_cidx = cidx;
3588	txq->ift_gen = gen;
3589}
3590
3591static __inline int
3592iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
3593{
3594	int reclaim;
3595	if_ctx_t ctx = txq->ift_ctx;
3596
3597	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
3598	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
3599
3600	/*
3601	 * Need a rate-limiting check so that this isn't called every time
3602	 */
3603	iflib_tx_credits_update(ctx, txq);
3604	reclaim = DESC_RECLAIMABLE(txq);
3605
3606	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
3607#ifdef INVARIANTS
3608		if (iflib_verbose_debug) {
3609			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
3610			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
3611			       reclaim, thresh);
3612
3613		}
3614#endif
3615		return (0);
3616	}
3617	iflib_tx_desc_free(txq, reclaim);
3618	txq->ift_cleaned += reclaim;
3619	txq->ift_in_use -= reclaim;
3620
3621	return (reclaim);
3622}
3623
3624static struct mbuf **
3625_ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
3626{
3627	int next, size;
3628	struct mbuf **items;
3629
3630	size = r->size;
3631	next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
3632	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
3633
3634	prefetch(items[(cidx + offset) & (size-1)]);
3635	if (remaining > 1) {
3636		prefetch2cachelines(&items[next]);
3637		prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]);
3638		prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
3639		prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
3640	}
3641	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
3642}
3643
3644static void
3645iflib_txq_check_drain(iflib_txq_t txq, int budget)
3646{
3647
3648	ifmp_ring_check_drainage(txq->ift_br, budget);
3649}
3650
3651static uint32_t
3652iflib_txq_can_drain(struct ifmp_ring *r)
3653{
3654	iflib_txq_t txq = r->cookie;
3655	if_ctx_t ctx = txq->ift_ctx;
3656
3657	if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
3658		return (1);
3659	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3660	    BUS_DMASYNC_POSTREAD);
3661	return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
3662	    false));
3663}
3664
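/*
 * mp_ring drain callback: reclaim completed descriptors, then encapsulate
 * up to a batch of queued mbufs into hardware descriptors, ringing the
 * doorbell and updating interface counters along the way.  Returns the
 * number of ring entries consumed.
 */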
3665static uint32_t
3666iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3667{
3668	iflib_txq_t txq = r->cookie;
3669	if_ctx_t ctx = txq->ift_ctx;
3670	if_t ifp = ctx->ifc_ifp;
3671	struct mbuf *m, **mp;
3672	int avail, bytes_sent, consumed, count, err, i, in_use_prev;
3673	int mcast_sent, pkt_sent, reclaimed, txq_avail;
3674	bool do_prefetch, rang, ring;
3675
3676	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
3677			    !LINK_ACTIVE(ctx))) {
3678		DBG_COUNTER_INC(txq_drain_notready);
3679		return (0);
3680	}
3681	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
3682	rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use);
3683	avail = IDXDIFF(pidx, cidx, r->size);
3684	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
3685		DBG_COUNTER_INC(txq_drain_flushing);
3686		for (i = 0; i < avail; i++) {
3687			if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
3688				m_free(r->items[(cidx + i) & (r->size-1)]);
3689			r->items[(cidx + i) & (r->size-1)] = NULL;
3690		}
3691		return (avail);
3692	}
3693
3694	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
3695		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3696		CALLOUT_LOCK(txq);
3697		callout_stop(&txq->ift_timer);
3698		CALLOUT_UNLOCK(txq);
3699		DBG_COUNTER_INC(txq_drain_oactive);
3700		return (0);
3701	}
3702	if (reclaimed)
3703		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3704	consumed = mcast_sent = bytes_sent = pkt_sent = 0;
3705	count = MIN(avail, TX_BATCH_SIZE);
3706#ifdef INVARIANTS
3707	if (iflib_verbose_debug)
3708		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
3709		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
3710#endif
3711	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
3712	txq_avail = TXQ_AVAIL(txq);
3713	err = 0;
3714	for (i = 0; i < count && txq_avail > MAX_TX_DESC(ctx) + 2; i++) {
3715		int rem = do_prefetch ? count - i : 0;
3716
3717		mp = _ring_peek_one(r, cidx, i, rem);
3718		MPASS(mp != NULL && *mp != NULL);
3719		if (__predict_false(*mp == (struct mbuf *)txq)) {
3720			consumed++;
3721			continue;
3722		}
3723		in_use_prev = txq->ift_in_use;
3724		err = iflib_encap(txq, mp);
3725		if (__predict_false(err)) {
3726			/* no room - bail out */
3727			if (err == ENOBUFS)
3728				break;
3729			consumed++;
3730			/* we can't send this packet - skip it */
3731			continue;
3732		}
3733		consumed++;
3734		pkt_sent++;
3735		m = *mp;
3736		DBG_COUNTER_INC(tx_sent);
3737		bytes_sent += m->m_pkthdr.len;
3738		mcast_sent += !!(m->m_flags & M_MCAST);
3739		txq_avail = TXQ_AVAIL(txq);
3740
3741		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
3742		ETHER_BPF_MTAP(ifp, m);
3743		if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
3744			break;
3745		rang = iflib_txd_db_check(ctx, txq, false, in_use_prev);
3746	}
3747
3748	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
3749	ring = rang ? false  : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx));
3750	iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use);
3751	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
3752	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
3753	if (mcast_sent)
3754		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
3755#ifdef INVARIANTS
3756	if (iflib_verbose_debug)
3757		printf("consumed=%d\n", consumed);
3758#endif
3759	return (consumed);
3760}
3761
3762static uint32_t
3763iflib_txq_drain_always(struct ifmp_ring *r)
3764{
3765	return (1);
3766}
3767
3768static uint32_t
3769iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3770{
3771	int i, avail;
3772	struct mbuf **mp;
3773	iflib_txq_t txq;
3774
3775	txq = r->cookie;
3776
3777	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3778	CALLOUT_LOCK(txq);
3779	callout_stop(&txq->ift_timer);
3780	CALLOUT_UNLOCK(txq);
3781
3782	avail = IDXDIFF(pidx, cidx, r->size);
3783	for (i = 0; i < avail; i++) {
3784		mp = _ring_peek_one(r, cidx, i, avail - i);
3785		if (__predict_false(*mp == (struct mbuf *)txq))
3786			continue;
3787		m_freem(*mp);
3788		DBG_COUNTER_INC(tx_frees);
3789	}
3790	MPASS(ifmp_ring_is_stalled(r) == 0);
3791	return (avail);
3792}
3793
3794static void
3795iflib_ifmp_purge(iflib_txq_t txq)
3796{
3797	struct ifmp_ring *r;
3798
3799	r = txq->ift_br;
3800	r->drain = iflib_txq_drain_free;
3801	r->can_drain = iflib_txq_drain_always;
3802
3803	ifmp_ring_check_drainage(r, r->size);
3804
3805	r->drain = iflib_txq_drain;
3806	r->can_drain = iflib_txq_can_drain;
3807}
3808
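/*
 * Deferred TX interrupt handler: service the netmap and ALTQ paths when
 * enabled, kick or drain the queue's mp_ring, and re-enable the TX queue
 * interrupt.
 */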
3809static void
3810_task_fn_tx(void *context)
3811{
3812	iflib_txq_t txq = context;
3813	if_ctx_t ctx = txq->ift_ctx;
3814#if defined(ALTQ) || defined(DEV_NETMAP)
3815	if_t ifp = ctx->ifc_ifp;
3816#endif
3817	int abdicate = ctx->ifc_sysctl_tx_abdicate;
3818
3819#ifdef IFLIB_DIAGNOSTICS
3820	txq->ift_cpu_exec_count[curcpu]++;
3821#endif
3822	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
3823		return;
3824#ifdef DEV_NETMAP
3825	if (if_getcapenable(ifp) & IFCAP_NETMAP) {
3826		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3827		    BUS_DMASYNC_POSTREAD);
3828		if (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, false))
3829			netmap_tx_irq(ifp, txq->ift_id);
3830		if (ctx->ifc_flags & IFC_LEGACY)
3831			IFDI_INTR_ENABLE(ctx);
3832		else
3833			IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
3834		return;
3835	}
3836#endif
3837#ifdef ALTQ
3838	if (ALTQ_IS_ENABLED(&ifp->if_snd))
3839		iflib_altq_if_start(ifp);
3840#endif
3841	if (txq->ift_db_pending)
3842		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
3843	else if (!abdicate)
3844		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
3845	/*
3846	 * When abdicating, we always need to check drainage, not just when we don't enqueue
3847	 */
3848	if (abdicate)
3849		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
3850	if (ctx->ifc_flags & IFC_LEGACY)
3851		IFDI_INTR_ENABLE(ctx);
3852	else
3853		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
3854}
3855
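/*
 * Deferred RX interrupt handler: process up to the configured budget of
 * received packets, re-enabling the interrupt when no work remains and
 * rescheduling itself otherwise.
 */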
3856static void
3857_task_fn_rx(void *context)
3858{
3859	iflib_rxq_t rxq = context;
3860	if_ctx_t ctx = rxq->ifr_ctx;
3861	bool more;
3862	uint16_t budget;
3863
3864#ifdef IFLIB_DIAGNOSTICS
3865	rxq->ifr_cpu_exec_count[curcpu]++;
3866#endif
3867	DBG_COUNTER_INC(task_fn_rxs);
3868	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
3869		return;
3870	more = true;
3871#ifdef DEV_NETMAP
3872	if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP) {
3873		u_int work = 0;
3874		if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work)) {
3875			more = false;
3876		}
3877	}
3878#endif
3879	budget = ctx->ifc_sysctl_rx_budget;
3880	if (budget == 0)
3881		budget = 16;	/* XXX */
3882	if (more == false || (more = iflib_rxeof(rxq, budget)) == false) {
3883		if (ctx->ifc_flags & IFC_LEGACY)
3884			IFDI_INTR_ENABLE(ctx);
3885		else
3886			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
3887		DBG_COUNTER_INC(rx_intr_enables);
3888	}
3889	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
3890		return;
3891	if (more)
3892		GROUPTASK_ENQUEUE(&rxq->ifr_task);
3893}
3894
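/*
 * Deferred admin handler: run watchdog resets and driver status updates,
 * restart each TX queue's timer, reinitialize the interface when a reset
 * was requested, and drain the TX queues once the link is active.
 */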
3895static void
3896_task_fn_admin(void *context)
3897{
3898	if_ctx_t ctx = context;
3899	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
3900	iflib_txq_t txq;
3901	int i;
3902	bool oactive, running, do_reset, do_watchdog, in_detach;
3903	uint32_t reset_on = hz / 2;
3904
3905	STATE_LOCK(ctx);
3906	running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
3907	oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
3908	do_reset = (ctx->ifc_flags & IFC_DO_RESET);
3909	do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
3910	in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
3911	ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
3912	STATE_UNLOCK(ctx);
3913
3914	if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
3915		return;
3916	if (in_detach)
3917		return;
3918
3919	CTX_LOCK(ctx);
3920	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
3921		CALLOUT_LOCK(txq);
3922		callout_stop(&txq->ift_timer);
3923		CALLOUT_UNLOCK(txq);
3924	}
3925	if (do_watchdog) {
3926		ctx->ifc_watchdog_events++;
3927		IFDI_WATCHDOG_RESET(ctx);
3928	}
3929	IFDI_UPDATE_ADMIN_STATUS(ctx);
3930	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
3931#ifdef DEV_NETMAP
3932		reset_on = hz / 2;
3933		if (if_getcapenable(ctx->ifc_ifp) & IFCAP_NETMAP)
3934			iflib_netmap_timer_adjust(ctx, txq, &reset_on);
3935#endif
3936		callout_reset_on(&txq->ift_timer, reset_on, iflib_timer, txq, txq->ift_timer.c_cpu);
3937	}
3938	IFDI_LINK_INTR_ENABLE(ctx);
3939	if (do_reset)
3940		iflib_if_init_locked(ctx);
3941	CTX_UNLOCK(ctx);
3942
3943	if (LINK_ACTIVE(ctx) == 0)
3944		return;
3945	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
3946		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
3947}
3948
3949
3950static void
3951_task_fn_iov(void *context)
3952{
3953	if_ctx_t ctx = context;
3954
3955	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) &&
3956	    !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
3957		return;
3958
3959	CTX_LOCK(ctx);
3960	IFDI_VFLR_HANDLE(ctx);
3961	CTX_UNLOCK(ctx);
3962}
3963
3964static int
3965iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
3966{
3967	int err;
3968	if_int_delay_info_t info;
3969	if_ctx_t ctx;
3970
3971	info = (if_int_delay_info_t)arg1;
3972	ctx = info->iidi_ctx;
3973	info->iidi_req = req;
3974	info->iidi_oidp = oidp;
3975	CTX_LOCK(ctx);
3976	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
3977	CTX_UNLOCK(ctx);
3978	return (err);
3979}
3980
3981/*********************************************************************
3982 *
3983 *  IFNET FUNCTIONS
3984 *
3985 **********************************************************************/
3986
3987static void
3988iflib_if_init_locked(if_ctx_t ctx)
3989{
3990	iflib_stop(ctx);
3991	iflib_init_locked(ctx);
3992}
3993
3994
3995static void
3996iflib_if_init(void *arg)
3997{
3998	if_ctx_t ctx = arg;
3999
4000	CTX_LOCK(ctx);
4001	iflib_if_init_locked(ctx);
4002	CTX_UNLOCK(ctx);
4003}
4004
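/*
 * if_transmit method: select a TX queue based on the mbuf's flow id,
 * enqueue the packet on that queue's mp_ring, and leave the actual
 * encapsulation to the queue's drain routine.
 */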
4005static int
4006iflib_if_transmit(if_t ifp, struct mbuf *m)
4007{
4008	if_ctx_t	ctx = if_getsoftc(ifp);
4009
4010	iflib_txq_t txq;
4011	int err, qidx;
4012	int abdicate = ctx->ifc_sysctl_tx_abdicate;
4013
4014	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
4015		DBG_COUNTER_INC(tx_frees);
4016		m_freem(m);
4017		return (ENETDOWN);
4018	}
4019
4020	MPASS(m->m_nextpkt == NULL);
4021	/* ALTQ-enabled interfaces always use queue 0. */
4022	qidx = 0;
4023	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
4024		qidx = QIDX(ctx, m);
4025	/*
4026	 * XXX calculate buf_ring based on flowid (divvy up bits?)
4027	 */
4028	txq = &ctx->ifc_txqs[qidx];
4029
4030#ifdef DRIVER_BACKPRESSURE
4031	if (txq->ift_closed) {
4032		while (m != NULL) {
4033			next = m->m_nextpkt;
4034			m->m_nextpkt = NULL;
4035			m_freem(m);
4036			DBG_COUNTER_INC(tx_frees);
4037			m = next;
4038		}
4039		return (ENOBUFS);
4040	}
4041#endif
4042#ifdef notyet
4043	qidx = count = 0;
4044	mp = marr;
4045	next = m;
4046	do {
4047		count++;
4048		next = next->m_nextpkt;
4049	} while (next != NULL);
4050
4051	if (count > nitems(marr))
4052		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
4053			/* XXX check nextpkt */
4054			m_freem(m);
4055			/* XXX simplify for now */
4056			DBG_COUNTER_INC(tx_frees);
4057			return (ENOBUFS);
4058		}
4059	for (next = m, i = 0; next != NULL; i++) {
4060		mp[i] = next;
4061		next = next->m_nextpkt;
4062		mp[i]->m_nextpkt = NULL;
4063	}
4064#endif
4065	DBG_COUNTER_INC(tx_seen);
4066	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
4067
4068	if (abdicate)
4069		GROUPTASK_ENQUEUE(&txq->ift_task);
4070 	if (err) {
4071		if (!abdicate)
4072			GROUPTASK_ENQUEUE(&txq->ift_task);
4073		/* backpressure support forthcoming */
4074#ifdef DRIVER_BACKPRESSURE
4075		txq->ift_closed = TRUE;
4076#endif
4077		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4078		m_freem(m);
4079		DBG_COUNTER_INC(tx_frees);
4080	}
4081
4082	return (err);
4083}
4084
4085#ifdef ALTQ
4086/*
4087 * The overall approach to integrating iflib with ALTQ is to continue to use
4088 * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
4089 * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
4090 * is redundant/unnecessary, but doing so minimizes the amount of
4091 * ALTQ-specific code required in iflib.  It is assumed that the overhead of
4092 * redundantly queueing to an intermediate mp_ring is swamped by the
4093 * performance limitations inherent in using ALTQ.
4094 *
4095 * When ALTQ support is compiled in, all iflib drivers will use a transmit
4096 * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
4097 * given interface.  If ALTQ is enabled for an interface, then all
4098 * transmitted packets for that interface will be submitted to the ALTQ
4099 * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
4100 * implementation because it uses IFQ_HANDOFF(), which will duplicatively
4101 * update stats that the iflib machinery handles, and which is sensitive to
4102 * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
4103 * will be installed as the start routine for use by ALTQ facilities that
4104 * need to trigger queue drains on a scheduled basis.
4105 *
4106 */
4107static void
4108iflib_altq_if_start(if_t ifp)
4109{
4110	struct ifaltq *ifq = &ifp->if_snd;
4111	struct mbuf *m;
4112
4113	IFQ_LOCK(ifq);
4114	IFQ_DEQUEUE_NOLOCK(ifq, m);
4115	while (m != NULL) {
4116		iflib_if_transmit(ifp, m);
4117		IFQ_DEQUEUE_NOLOCK(ifq, m);
4118	}
4119	IFQ_UNLOCK(ifq);
4120}
4121
4122static int
4123iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
4124{
4125	int err;
4126
4127	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
4128		IFQ_ENQUEUE(&ifp->if_snd, m, err);
4129		if (err == 0)
4130			iflib_altq_if_start(ifp);
4131	} else
4132		err = iflib_if_transmit(ifp, m);
4133
4134	return (err);
4135}
4136#endif /* ALTQ */
4137
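/*
 * if_qflush method: flag the context so the drain routine discards
 * pending packets, then drain every TX ring until it is idle or stalled.
 */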
4138static void
4139iflib_if_qflush(if_t ifp)
4140{
4141	if_ctx_t ctx = if_getsoftc(ifp);
4142	iflib_txq_t txq = ctx->ifc_txqs;
4143	int i;
4144
4145	STATE_LOCK(ctx);
4146	ctx->ifc_flags |= IFC_QFLUSH;
4147	STATE_UNLOCK(ctx);
4148	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
4149		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
4150			iflib_txq_check_drain(txq, 0);
4151	STATE_LOCK(ctx);
4152	ctx->ifc_flags &= ~IFC_QFLUSH;
4153	STATE_UNLOCK(ctx);
4154
4155	/*
4156	 * When ALTQ is enabled, this will also take care of purging the
4157	 * ALTQ queue(s).
4158	 */
4159	if_qflush(ifp);
4160}
4161
4162
4163#define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
4164		     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
4165		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
4166		     IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM)
4167
4168static int
4169iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
4170{
4171	if_ctx_t ctx = if_getsoftc(ifp);
4172	struct ifreq	*ifr = (struct ifreq *)data;
4173#if defined(INET) || defined(INET6)
4174	struct ifaddr	*ifa = (struct ifaddr *)data;
4175#endif
4176	bool		avoid_reset = false;
4177	int		err = 0, reinit = 0, bits;
4178
4179	switch (command) {
4180	case SIOCSIFADDR:
4181#ifdef INET
4182		if (ifa->ifa_addr->sa_family == AF_INET)
4183			avoid_reset = true;
4184#endif
4185#ifdef INET6
4186		if (ifa->ifa_addr->sa_family == AF_INET6)
4187			avoid_reset = true;
4188#endif
4189		/*
4190		** Calling init results in link renegotiation,
4191		** so we avoid doing it when possible.
4192		*/
4193		if (avoid_reset) {
4194			if_setflagbits(ifp, IFF_UP, 0);
4195			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
4196				reinit = 1;
4197#ifdef INET
4198			if (!(if_getflags(ifp) & IFF_NOARP))
4199				arp_ifinit(ifp, ifa);
4200#endif
4201		} else
4202			err = ether_ioctl(ifp, command, data);
4203		break;
4204	case SIOCSIFMTU:
4205		CTX_LOCK(ctx);
4206		if (ifr->ifr_mtu == if_getmtu(ifp)) {
4207			CTX_UNLOCK(ctx);
4208			break;
4209		}
4210		bits = if_getdrvflags(ifp);
4211		/* stop the driver and free any clusters before proceeding */
4212		iflib_stop(ctx);
4213
4214		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
4215			STATE_LOCK(ctx);
4216			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
4217				ctx->ifc_flags |= IFC_MULTISEG;
4218			else
4219				ctx->ifc_flags &= ~IFC_MULTISEG;
4220			STATE_UNLOCK(ctx);
4221			err = if_setmtu(ifp, ifr->ifr_mtu);
4222		}
4223		iflib_init_locked(ctx);
4224		STATE_LOCK(ctx);
4225		if_setdrvflags(ifp, bits);
4226		STATE_UNLOCK(ctx);
4227		CTX_UNLOCK(ctx);
4228		break;
4229	case SIOCSIFFLAGS:
4230		CTX_LOCK(ctx);
4231		if (if_getflags(ifp) & IFF_UP) {
4232			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4233				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
4234				    (IFF_PROMISC | IFF_ALLMULTI)) {
4235					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
4236				}
4237			} else
4238				reinit = 1;
4239		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4240			iflib_stop(ctx);
4241		}
4242		ctx->ifc_if_flags = if_getflags(ifp);
4243		CTX_UNLOCK(ctx);
4244		break;
4245	case SIOCADDMULTI:
4246	case SIOCDELMULTI:
4247		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4248			CTX_LOCK(ctx);
4249			IFDI_INTR_DISABLE(ctx);
4250			IFDI_MULTI_SET(ctx);
4251			IFDI_INTR_ENABLE(ctx);
4252			CTX_UNLOCK(ctx);
4253		}
4254		break;
4255	case SIOCSIFMEDIA:
4256		CTX_LOCK(ctx);
4257		IFDI_MEDIA_SET(ctx);
4258		CTX_UNLOCK(ctx);
4259		/* FALLTHROUGH */
4260	case SIOCGIFMEDIA:
4261#ifndef __HAIKU__
4262	case SIOCGIFXMEDIA:
4263#endif
4264		err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command);
4265		break;
4266#ifndef __HAIKU__
4267	case SIOCGI2C:
4268	{
4269		struct ifi2creq i2c;
4270
4271		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4272		if (err != 0)
4273			break;
4274		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
4275			err = EINVAL;
4276			break;
4277		}
4278		if (i2c.len > sizeof(i2c.data)) {
4279			err = EINVAL;
4280			break;
4281		}
4282
4283		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
4284			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4285			    sizeof(i2c));
4286		break;
4287	}
4288#endif
4289	case SIOCSIFCAP:
4290	{
4291		int mask, setmask, oldmask;
4292
4293		oldmask = if_getcapenable(ifp);
4294		mask = ifr->ifr_reqcap ^ oldmask;
4295		mask &= ctx->ifc_softc_ctx.isc_capabilities;
4296		setmask = 0;
4297#ifdef TCP_OFFLOAD
4298		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
4299#endif
4300		setmask |= (mask & IFCAP_FLAGS);
4301		setmask |= (mask & IFCAP_WOL);
4302
4303		/*
4304		 * If any RX csum has changed, change all the ones that
4305		 * are supported by the driver.
4306		 */
4307		if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
4308			setmask |= ctx->ifc_softc_ctx.isc_capabilities &
4309			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
4310		}
4311
4312		/*
4313		 * We want to ensure that traffic has stopped before we change any of the flags
4314		 */
4315		if (setmask) {
4316			CTX_LOCK(ctx);
4317			bits = if_getdrvflags(ifp);
4318			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4319				iflib_stop(ctx);
4320			STATE_LOCK(ctx);
4321			if_togglecapenable(ifp, setmask);
4322			STATE_UNLOCK(ctx);
4323			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4324				iflib_init_locked(ctx);
4325			STATE_LOCK(ctx);
4326			if_setdrvflags(ifp, bits);
4327			STATE_UNLOCK(ctx);
4328			CTX_UNLOCK(ctx);
4329		}
4330		if_vlancap(ifp);
4331		break;
4332	}
4333	case SIOCGPRIVATE_0:
4334	case SIOCSDRVSPEC:
4335	case SIOCGDRVSPEC:
4336		CTX_LOCK(ctx);
4337		err = IFDI_PRIV_IOCTL(ctx, command, data);
4338		CTX_UNLOCK(ctx);
4339		break;
4340	default:
4341		err = ether_ioctl(ifp, command, data);
4342		break;
4343	}
4344	if (reinit)
4345		iflib_if_init(ctx);
4346	return (err);
4347}
4348
4349static uint64_t
4350iflib_if_get_counter(if_t ifp, ift_counter cnt)
4351{
4352	if_ctx_t ctx = if_getsoftc(ifp);
4353
4354	return (IFDI_GET_COUNTER(ctx, cnt));
4355}
4356
4357/*********************************************************************
4358 *
4359 *  OTHER FUNCTIONS EXPORTED TO THE STACK
4360 *
4361 **********************************************************************/
4362
4363static void
4364iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
4365{
4366	if_ctx_t ctx = if_getsoftc(ifp);
4367
4368	if ((void *)ctx != arg)
4369		return;
4370
4371	if ((vtag == 0) || (vtag > 4095))
4372		return;
4373
4374	CTX_LOCK(ctx);
4375	IFDI_VLAN_REGISTER(ctx, vtag);
4376	/* Re-init to load the changes */
4377	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
4378		iflib_if_init_locked(ctx);
4379	CTX_UNLOCK(ctx);
4380}
4381
4382static void
4383iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
4384{
4385	if_ctx_t ctx = if_getsoftc(ifp);
4386
4387	if ((void *)ctx != arg)
4388		return;
4389
4390	if ((vtag == 0) || (vtag > 4095))
4391		return;
4392
4393	CTX_LOCK(ctx);
4394	IFDI_VLAN_UNREGISTER(ctx, vtag);
4395	/* Re-init to load the changes */
4396	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
4397		iflib_if_init_locked(ctx);
4398	CTX_UNLOCK(ctx);
4399}
4400
4401static void
4402iflib_led_func(void *arg, int onoff)
4403{
4404	if_ctx_t ctx = arg;
4405
4406	CTX_LOCK(ctx);
4407	IFDI_LED_FUNC(ctx, onoff);
4408	CTX_UNLOCK(ctx);
4409}
4410
4411/*********************************************************************
4412 *
4413 *  BUS FUNCTION DEFINITIONS
4414 *
4415 **********************************************************************/
4416
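/*
 * Match the device's PCI IDs against the driver's vendor info table,
 * treating zero subvendor/subdevice/revision entries as wildcards.
 */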
4417int
4418iflib_device_probe(device_t dev)
4419{
4420	const pci_vendor_info_t *ent;
4421	if_shared_ctx_t sctx;
4422	uint16_t pci_device_id, pci_rev_id, pci_subdevice_id, pci_subvendor_id;
4423	uint16_t pci_vendor_id;
4424
4425	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
4426		return (ENOTSUP);
4427
4428	pci_vendor_id = pci_get_vendor(dev);
4429	pci_device_id = pci_get_device(dev);
4430	pci_subvendor_id = pci_get_subvendor(dev);
4431	pci_subdevice_id = pci_get_subdevice(dev);
4432	pci_rev_id = pci_get_revid(dev);
4433	if (sctx->isc_parse_devinfo != NULL)
4434		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
4435
4436	ent = sctx->isc_vendor_info;
4437	while (ent->pvi_vendor_id != 0) {
4438		if (pci_vendor_id != ent->pvi_vendor_id) {
4439			ent++;
4440			continue;
4441		}
4442		if ((pci_device_id == ent->pvi_device_id) &&
4443		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
4444		     (ent->pvi_subvendor_id == 0)) &&
4445		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
4446		     (ent->pvi_subdevice_id == 0)) &&
4447		    ((pci_rev_id == ent->pvi_rev_id) ||
4448		     (ent->pvi_rev_id == 0))) {
4449
4450			device_set_desc_copy(dev, ent->pvi_name);
4451			/* this needs to be changed to zero if the bus probing code
4452			 * ever stops re-probing on best match because the sctx
4453			 * may have its values overwritten by register calls
4454			 * in subsequent probes
4455			 */
4456			return (BUS_PROBE_DEFAULT);
4457		}
4458		ent++;
4459	}
4460	return (ENXIO);
4461}
4462
4463int
4464iflib_device_probe_vendor(device_t dev)
4465{
4466	int probe;
4467
4468	probe = iflib_device_probe(dev);
4469#ifndef __HAIKU__
4470	if (probe == BUS_PROBE_DEFAULT)
4471		return (BUS_PROBE_VENDOR);
4472	else
4473#endif
4474		return (probe);
4475}
4476
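/*
 * Apply any sysctl-requested queue and descriptor counts, clamping each
 * descriptor ring to the driver's min/max bounds and falling back to the
 * default when a requested value is not a power of 2.
 */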
4477static void
4478iflib_reset_qvalues(if_ctx_t ctx)
4479{
4480	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4481	if_shared_ctx_t sctx = ctx->ifc_sctx;
4482	device_t dev = ctx->ifc_dev;
4483	int i;
4484
4485	if (ctx->ifc_sysctl_ntxqs != 0)
4486		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
4487	if (ctx->ifc_sysctl_nrxqs != 0)
4488		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
4489
4490	for (i = 0; i < sctx->isc_ntxqs; i++) {
4491		if (ctx->ifc_sysctl_ntxds[i] != 0)
4492			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
4493		else
4494			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4495	}
4496
4497	for (i = 0; i < sctx->isc_nrxqs; i++) {
4498		if (ctx->ifc_sysctl_nrxds[i] != 0)
4499			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
4500		else
4501			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4502	}
4503
4504	for (i = 0; i < sctx->isc_nrxqs; i++) {
4505		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
4506			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
4507				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
4508			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
4509		}
4510		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
4511			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
4512				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
4513			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
4514		}
4515		if (!powerof2(scctx->isc_nrxd[i])) {
4516			device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
4517				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_default[i]);
4518			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4519		}
4520	}
4521
4522	for (i = 0; i < sctx->isc_ntxqs; i++) {
4523		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
4524			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
4525				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
4526			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
4527		}
4528		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
4529			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
4530				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
4531			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
4532		}
4533		if (!powerof2(scctx->isc_ntxd[i])) {
4534			device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
4535				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_default[i]);
4536			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4537		}
4538	}
4539}
4540
4541static void
4542iflib_add_pfil(if_ctx_t ctx)
4543{
4544#ifndef __HAIKU__
4545	struct pfil_head *pfil;
4546	struct pfil_head_args pa;
4547	iflib_rxq_t rxq;
4548	int i;
4549
4550	pa.pa_version = PFIL_VERSION;
4551	pa.pa_flags = PFIL_IN;
4552	pa.pa_type = PFIL_TYPE_ETHERNET;
4553	pa.pa_headname = ctx->ifc_ifp->if_xname;
4554	pfil = pfil_head_register(&pa);
4555
4556	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
4557		rxq->pfil = pfil;
4558	}
4559#endif
4560}
4561
4562static void
4563iflib_rem_pfil(if_ctx_t ctx)
4564{
4565#ifndef __HAIKU__
4566	struct pfil_head *pfil;
4567	iflib_rxq_t rxq;
4568	int i;
4569
4570	rxq = ctx->ifc_rxqs;
4571	pfil = rxq->pfil;
4572	for (i = 0; i < NRXQSETS(ctx); i++, rxq++) {
4573		rxq->pfil = NULL;
4574	}
4575	pfil_head_unregister(pfil);
4576#endif
4577}
4578
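/*
 * Determine the starting CPU offset for this context's queues, sharing a
 * rotating offset among contexts that use the same CPU set so that their
 * queues are not all bound to the same cores.
 */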
4579static uint16_t
4580get_ctx_core_offset(if_ctx_t ctx)
4581{
4582#ifndef __HAIKU__
4583	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4584	struct cpu_offset *op;
4585	uint16_t qc;
4586	uint16_t ret = ctx->ifc_sysctl_core_offset;
4587
4588	if (ret != CORE_OFFSET_UNSPECIFIED)
4589		return (ret);
4590
4591	if (ctx->ifc_sysctl_separate_txrx)
4592		qc = scctx->isc_ntxqsets + scctx->isc_nrxqsets;
4593	else
4594		qc = max(scctx->isc_ntxqsets, scctx->isc_nrxqsets);
4595
4596	mtx_lock(&cpu_offset_mtx);
4597	SLIST_FOREACH(op, &cpu_offsets, entries) {
4598		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
4599			ret = op->offset;
4600			op->offset += qc;
4601			MPASS(op->refcount < UINT_MAX);
4602			op->refcount++;
4603			break;
4604		}
4605	}
4606	if (ret == CORE_OFFSET_UNSPECIFIED) {
4607		ret = 0;
4608		op = malloc(sizeof(struct cpu_offset), M_IFLIB,
4609		    M_NOWAIT | M_ZERO);
4610		if (op == NULL) {
4611			device_printf(ctx->ifc_dev,
4612			    "allocation for cpu offset failed.\n");
4613		} else {
4614			op->offset = qc;
4615			op->refcount = 1;
4616			CPU_COPY(&ctx->ifc_cpus, &op->set);
4617			SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
4618		}
4619	}
4620	mtx_unlock(&cpu_offset_mtx);
4621
4622	return (ret);
4623#else
4624	return 0;
4625#endif
4626}
4627
4628static void
4629unref_ctx_core_offset(if_ctx_t ctx)
4630{
4631#ifndef __HAIKU__
4632	struct cpu_offset *op, *top;
4633
4634	mtx_lock(&cpu_offset_mtx);
4635	SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, top) {
4636		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
4637			MPASS(op->refcount > 0);
4638			op->refcount--;
4639			if (op->refcount == 0) {
4640				SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries);
4641				free(op, M_IFLIB);
4642			}
4643			break;
4644		}
4645	}
4646	mtx_unlock(&cpu_offset_mtx);
4647#endif
4648}
4649
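/*
 * Main attach path: allocate the iflib context, run the driver's
 * ATTACH_PRE/ATTACH_POST hooks, size and allocate the queues, set up
 * MSI-X, MSI, or legacy interrupts, and attach the ifnet.
 */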
4650int
4651iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
4652{
4653	if_ctx_t ctx;
4654	if_t ifp;
4655	if_softc_ctx_t scctx;
4656	kobjop_desc_t kobj_desc;
4657	kobj_method_t *kobj_method;
4658	int err, msix, rid;
4659	uint16_t main_rxq, main_txq;
4660
4661	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
4662
4663	if (sc == NULL) {
4664		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
4665		device_set_softc(dev, ctx);
4666		ctx->ifc_flags |= IFC_SC_ALLOCATED;
4667	}
4668
4669	ctx->ifc_sctx = sctx;
4670	ctx->ifc_dev = dev;
4671	ctx->ifc_softc = sc;
4672
4673	if ((err = iflib_register(ctx)) != 0) {
4674		device_printf(dev, "iflib_register failed %d\n", err);
4675		goto fail_ctx_free;
4676	}
4677	iflib_add_device_sysctl_pre(ctx);
4678
4679	scctx = &ctx->ifc_softc_ctx;
4680	ifp = ctx->ifc_ifp;
4681
4682	iflib_reset_qvalues(ctx);
4683	CTX_LOCK(ctx);
4684	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
4685		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
4686		goto fail_unlock;
4687	}
4688	_iflib_pre_assert(scctx);
4689	ctx->ifc_txrx = *scctx->isc_txrx;
4690
4691	if (sctx->isc_flags & IFLIB_DRIVER_MEDIA)
4692		ctx->ifc_mediap = scctx->isc_media;
4693
4694#ifdef INVARIANTS
4695	if (scctx->isc_capabilities & IFCAP_TXCSUM)
4696		MPASS(scctx->isc_tx_csum_flags);
4697#endif
4698
4699	if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS);
4700	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS);
4701
4702	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
4703		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
4704	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
4705		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
4706
4707	main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
4708	main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
4709
4710	/* XXX change for per-queue sizes */
4711	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
4712	    scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
4713
4714	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
4715	    MAX_SINGLE_PACKET_FRACTION)
4716		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
4717		    MAX_SINGLE_PACKET_FRACTION);
4718	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
4719	    MAX_SINGLE_PACKET_FRACTION)
4720		scctx->isc_tx_tso_segments_max = max(1,
4721		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
4722
4723	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
4724	if (if_getcapabilities(ifp) & IFCAP_TSO) {
4725#ifndef __HAIKU__
4726		/*
4727		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
4728		 * but some MACs do.
4729		 */
4730		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
4731		    IP_MAXPACKET));
4732		/*
4733		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
4734		 * into account.  In the worst case, each of these calls will
4735		 * add another mbuf and, thus, the requirement for another DMA
4736		 * segment.  So for best performance, it doesn't make sense to
4737		 * advertise a maximum of TSO segments that typically will
4738		 * require defragmentation in iflib_encap().
4739		 */
4740		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
4741		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
4742#endif
4743	}
4744	if (scctx->isc_rss_table_size == 0)
4745		scctx->isc_rss_table_size = 64;
4746	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
4747
4748	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
4749	/* XXX format name */
4750	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
4751	    NULL, NULL, "admin");
4752
4753#ifndef __HAIKU__
4754	/* Set up cpu set.  If it fails, use the set of all CPUs. */
4755	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
4756		device_printf(dev, "Unable to fetch CPU list\n");
4757		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
4758	}
4759	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
4760#endif
4761
4762	/*
4763	** Now set up MSI or MSI-X; this should return the number of supported
4764	** vectors (1 for a legacy interrupt or MSI).
4765	*/
4766	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
4767		msix = scctx->isc_vectors;
4768	} else if (scctx->isc_msix_bar != 0)
4769	       /*
4770		* The simple fact that isc_msix_bar is not 0 does not mean we
4771		* have a good value there that is known to work.
4772		*/
4773		msix = iflib_msix_init(ctx);
4774	else {
4775		scctx->isc_vectors = 1;
4776		scctx->isc_ntxqsets = 1;
4777		scctx->isc_nrxqsets = 1;
4778		scctx->isc_intr = IFLIB_INTR_LEGACY;
4779		msix = 0;
4780	}
4781	/* Get memory for the station queues */
4782	if ((err = iflib_queues_alloc(ctx))) {
4783		device_printf(dev, "Unable to allocate queue memory\n");
4784		goto fail_intr_free;
4785	}
4786
4787	if ((err = iflib_qset_structures_setup(ctx)))
4788		goto fail_queues;
4789
4790	/*
4791	 * Now that we know how many queues there are, get the core offset.
4792	 */
4793	ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
4794
4795	/*
4796	 * Group taskqueues aren't properly set up until SMP is started,
4797	 * so we disable interrupts until we can handle them post
4798	 * SI_SUB_SMP.
4799	 *
4800	 * XXX: disabling interrupts doesn't actually work, at least for
4801	 * the non-MSI case.  When they occur before SI_SUB_SMP completes,
4802	 * we do null handling and depend on this not causing too large an
4803	 * interrupt storm.
4804	 */
4805	IFDI_INTR_DISABLE(ctx);
4806
4807	if (msix > 1) {
4808		/*
4809		 * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
4810		 * aren't the default NULL implementation.
4811		 */
4812		kobj_desc = &ifdi_rx_queue_intr_enable_desc;
4813#ifdef __HAIKU__
4814		kobj_method = kobj_lookup_method(ctx->ops.cls, NULL,
4815#else
4816		kobj_method = kobj_lookup_method(((kobj_t)ctx), NULL,
4817#endif
4818		    kobj_desc);
4819		if (kobj_method == &kobj_desc->deflt) {
4820			device_printf(dev,
4821			    "MSI-X requires ifdi_rx_queue_intr_enable method");
4822			err = EOPNOTSUPP;
4823			goto fail_queues;
4824		}
4825		kobj_desc = &ifdi_tx_queue_intr_enable_desc;
4826#ifdef __HAIKU__
4827		kobj_method = kobj_lookup_method(ctx->ops.cls, NULL,
4828#else
4829		kobj_method = kobj_lookup_method(((kobj_t)ctx), NULL,
4830#endif
4831		    kobj_desc);
4832		if (kobj_method == &kobj_desc->deflt) {
4833			device_printf(dev,
4834			    "MSI-X requires ifdi_tx_queue_intr_enable method");
4835			err = EOPNOTSUPP;
4836			goto fail_queues;
4837		}
4838
4839		/*
4840		 * Assign the MSI-X vectors.
4841		 * Note that the default NULL ifdi_msix_intr_assign method will
4842		 * fail here, too.
4843		 */
4844		err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
4845		if (err != 0) {
4846			device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
4847			    err);
4848			goto fail_queues;
4849		}
4850	} else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
4851		rid = 0;
4852		if (scctx->isc_intr == IFLIB_INTR_MSI) {
4853			MPASS(msix == 1);
4854			rid = 1;
4855		}
4856		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
4857			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
4858			goto fail_queues;
4859		}
4860	} else {
4861		device_printf(dev,
4862		    "Cannot use iflib with only 1 MSI-X interrupt!\n");
4863		err = ENODEV;
4864		goto fail_intr_free;
4865	}
4866
4867	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
4868
4869	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
4870		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
4871		goto fail_detach;
4872	}
4873
4874	/*
4875	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
4876	 * This must appear after the call to ether_ifattach() because
4877	 * ether_ifattach() sets if_hdrlen to the default value.
4878	 */
4879	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
4880		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
4881
4882	if ((err = iflib_netmap_attach(ctx))) {
4883		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
4884		goto fail_detach;
4885	}
4886	*ctxp = ctx;
4887
4888	NETDUMP_SET(ctx->ifc_ifp, iflib);
4889
4890	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
4891	iflib_add_device_sysctl_post(ctx);
4892	iflib_add_pfil(ctx);
4893	ctx->ifc_flags |= IFC_INIT_DONE;
4894	CTX_UNLOCK(ctx);
4895
4896	return (0);
4897
4898fail_detach:
4899	ether_ifdetach(ctx->ifc_ifp);
4900fail_intr_free:
4901	iflib_free_intr_mem(ctx);
4902fail_queues:
4903	iflib_tx_structures_free(ctx);
4904	iflib_rx_structures_free(ctx);
4905	taskqgroup_detach(qgroup_if_config_tqg, &ctx->ifc_admin_task);
4906	IFDI_DETACH(ctx);
4907fail_unlock:
4908	CTX_UNLOCK(ctx);
4909	iflib_deregister(ctx);
4910fail_ctx_free:
4911	device_set_softc(ctx->ifc_dev, NULL);
4912	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
4913		free(ctx->ifc_softc, M_IFLIB);
4914	free(ctx, M_IFLIB);
4915	return (err);
4916}
4917
4918int
4919iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp,
4920					  struct iflib_cloneattach_ctx *clctx)
4921{
4922	int err;
4923	if_ctx_t ctx;
4924	if_t ifp;
4925	if_softc_ctx_t scctx;
4926	int i;
4927	void *sc;
4928	uint16_t main_txq;
4929	uint16_t main_rxq;
4930
4931	ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
4932	sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
4933	ctx->ifc_flags |= IFC_SC_ALLOCATED;
4934	if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL))
4935		ctx->ifc_flags |= IFC_PSEUDO;
4936
4937	ctx->ifc_sctx = sctx;
4938	ctx->ifc_softc = sc;
4939	ctx->ifc_dev = dev;
4940
4941	if ((err = iflib_register(ctx)) != 0) {
4942		device_printf(dev, "%s: iflib_register failed %d\n", __func__, err);
4943		goto fail_ctx_free;
4944	}
4945	iflib_add_device_sysctl_pre(ctx);
4946
4947	scctx = &ctx->ifc_softc_ctx;
4948	ifp = ctx->ifc_ifp;
4949
4950	iflib_reset_qvalues(ctx);
4951	CTX_LOCK(ctx);
4952	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
4953		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
4954		goto fail_unlock;
4955	}
4956#ifndef __HAIKU__
4957	if (sctx->isc_flags & IFLIB_GEN_MAC)
4958		ether_gen_addr(ifp, &ctx->ifc_mac);
4959#endif
4960	if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name,
4961								clctx->cc_params)) != 0) {
4962		device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err);
4963		goto fail_ctx_free;
4964	}
4965	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
4966	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
4967	ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
4968
4969#ifdef INVARIANTS
4970	if (scctx->isc_capabilities & IFCAP_TXCSUM)
4971		MPASS(scctx->isc_tx_csum_flags);
4972#endif
4973
4974	if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE);
4975	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE);
4976
4977	ifp->if_flags |= IFF_NOGROUP;
4978	if (sctx->isc_flags & IFLIB_PSEUDO) {
4979		ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
4980
4981		if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
4982			device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
4983			goto fail_detach;
4984		}
4985		*ctxp = ctx;
4986
4987		/*
4988		 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
4989		 * This must appear after the call to ether_ifattach() because
4990		 * ether_ifattach() sets if_hdrlen to the default value.
4991		 */
4992		if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
4993			if_setifheaderlen(ifp,
4994			    sizeof(struct ether_vlan_header));
4995
4996		if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
4997		iflib_add_device_sysctl_post(ctx);
4998		ctx->ifc_flags |= IFC_INIT_DONE;
4999		return (0);
5000	}
5001	_iflib_pre_assert(scctx);
5002	ctx->ifc_txrx = *scctx->isc_txrx;
5003
5004	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
5005		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
5006	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
5007		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
5008
5009	main_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
5010	main_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
5011
5012	/* XXX change for per-queue sizes */
5013	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
5014	    scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
5015
5016	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
5017	    MAX_SINGLE_PACKET_FRACTION)
5018		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
5019		    MAX_SINGLE_PACKET_FRACTION);
5020	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
5021	    MAX_SINGLE_PACKET_FRACTION)
5022		scctx->isc_tx_tso_segments_max = max(1,
5023		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
5024
5025	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
5026	if (if_getcapabilities(ifp) & IFCAP_TSO) {
5027#ifndef __HAIKU__
5028		/*
5029		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
5030		 * but some MACs do.
5031		 */
5032		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
5033		    IP_MAXPACKET));
5034		/*
5035		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
5036		 * into account.  In the worst case, each of these calls will
5037		 * add another mbuf and, thus, the requirement for another DMA
5038		 * segment.  So for best performance, it doesn't make sense to
5039		 * advertise a maximum of TSO segments that typically will
5040		 * require defragmentation in iflib_encap().
5041		 */
5042		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
5043		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
5044#endif
5045	}
5046	if (scctx->isc_rss_table_size == 0)
5047		scctx->isc_rss_table_size = 64;
5048	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
5049
5050	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
5051	/* XXX format name */
5052	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
5053	    NULL, NULL, "admin");
5054
5055	/* XXX --- can support > 1 -- but keep it simple for now */
5056	scctx->isc_intr = IFLIB_INTR_LEGACY;
5057
5058	/* Get memory for the station queues */
5059	if ((err = iflib_queues_alloc(ctx))) {
5060		device_printf(dev, "Unable to allocate queue memory\n");
5061		goto fail_iflib_detach;
5062	}
5063
5064	if ((err = iflib_qset_structures_setup(ctx))) {
5065		device_printf(dev, "qset structure setup failed %d\n", err);
5066		goto fail_queues;
5067	}
5068
5069	/*
5070	 * XXX What if anything do we want to do about interrupts?
5071	 */
5072	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
5073	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
5074		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
5075		goto fail_detach;
5076	}
5077
5078	/*
5079	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
5080	 * This must appear after the call to ether_ifattach() because
5081	 * ether_ifattach() sets if_hdrlen to the default value.
5082	 */
5083	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
5084		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
5085
5086	/* XXX handle more than one queue */
5087	for (i = 0; i < scctx->isc_nrxqsets; i++)
5088		IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl);
5089
5090	*ctxp = ctx;
5091
5092	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
5093	iflib_add_device_sysctl_post(ctx);
5094	ctx->ifc_flags |= IFC_INIT_DONE;
5095	CTX_UNLOCK(ctx);
5096
5097	return (0);
5098fail_detach:
5099	ether_ifdetach(ctx->ifc_ifp);
5100fail_queues:
5101	iflib_tx_structures_free(ctx);
5102	iflib_rx_structures_free(ctx);
5103fail_iflib_detach:
5104	IFDI_DETACH(ctx);
5105fail_unlock:
5106	CTX_UNLOCK(ctx);
5107	iflib_deregister(ctx);
5108fail_ctx_free:
5109	free(ctx->ifc_softc, M_IFLIB);
5110	free(ctx, M_IFLIB);
5111	return (err);
5112}
5113
5114int
5115iflib_pseudo_deregister(if_ctx_t ctx)
5116{
5117	if_t ifp = ctx->ifc_ifp;
5118	iflib_txq_t txq;
5119	iflib_rxq_t rxq;
5120	int i, j;
5121	struct taskqgroup *tqg;
5122	iflib_fl_t fl;
5123
5124	ether_ifdetach(ifp);
5125	/* XXX drain any dependent tasks */
5126	tqg = qgroup_if_io_tqg;
5127	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
5128		callout_drain(&txq->ift_timer);
5129		if (txq->ift_task.gt_uniq != NULL)
5130			taskqgroup_detach(tqg, &txq->ift_task);
5131	}
5132	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
5133		if (rxq->ifr_task.gt_uniq != NULL)
5134			taskqgroup_detach(tqg, &rxq->ifr_task);
5135
5136		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
5137			free(fl->ifl_rx_bitmap, M_IFLIB);
5138	}
5139	tqg = qgroup_if_config_tqg;
5140	if (ctx->ifc_admin_task.gt_uniq != NULL)
5141		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
5142	if (ctx->ifc_vflr_task.gt_uniq != NULL)
5143		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
5144
5145	iflib_tx_structures_free(ctx);
5146	iflib_rx_structures_free(ctx);
5147
5148	iflib_deregister(ctx);
5149
5150	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5151		free(ctx->ifc_softc, M_IFLIB);
5152	free(ctx, M_IFLIB);
5153	return (0);
5154}
5155
5156int
5157iflib_device_attach(device_t dev)
5158{
5159	if_ctx_t ctx;
5160	if_shared_ctx_t sctx;
5161
5162	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
5163		return (ENOTSUP);
5164
5165	pci_enable_busmaster(dev);
5166
5167	return (iflib_device_register(dev, NULL, sctx, &ctx));
5168}
5169
5170int
5171iflib_device_deregister(if_ctx_t ctx)
5172{
5173	if_t ifp = ctx->ifc_ifp;
5174	iflib_txq_t txq;
5175	iflib_rxq_t rxq;
5176	device_t dev = ctx->ifc_dev;
5177	int i, j;
5178	struct taskqgroup *tqg;
5179	iflib_fl_t fl;
5180
5181	/* Make sure VLANS are not using driver */
5182	if (if_vlantrunkinuse(ifp)) {
5183		device_printf(dev, "Vlan in use, detach first\n");
5184		return (EBUSY);
5185	}
5186#ifdef PCI_IOV
5187	if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
5188		device_printf(dev, "SR-IOV in use; detach first.\n");
5189		return (EBUSY);
5190	}
5191#endif
5192
5193	STATE_LOCK(ctx);
5194	ctx->ifc_flags |= IFC_IN_DETACH;
5195	STATE_UNLOCK(ctx);
5196
5197	CTX_LOCK(ctx);
5198	iflib_stop(ctx);
5199	CTX_UNLOCK(ctx);
5200
5201	/* Unregister VLAN events */
5202	if (ctx->ifc_vlan_attach_event != NULL)
5203		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
5204	if (ctx->ifc_vlan_detach_event != NULL)
5205		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
5206
5207	iflib_netmap_detach(ifp);
5208	ether_ifdetach(ifp);
5209	iflib_rem_pfil(ctx);
5210	if (ctx->ifc_led_dev != NULL)
5211		led_destroy(ctx->ifc_led_dev);
5212	/* XXX drain any dependent tasks */
5213	tqg = qgroup_if_io_tqg;
5214	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
5215		callout_drain(&txq->ift_timer);
5216		if (txq->ift_task.gt_uniq != NULL)
5217			taskqgroup_detach(tqg, &txq->ift_task);
5218	}
5219	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
5220		if (rxq->ifr_task.gt_uniq != NULL)
5221			taskqgroup_detach(tqg, &rxq->ifr_task);
5222
5223		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
5224			free(fl->ifl_rx_bitmap, M_IFLIB);
5225	}
5226	tqg = qgroup_if_config_tqg;
5227	if (ctx->ifc_admin_task.gt_uniq != NULL)
5228		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
5229	if (ctx->ifc_vflr_task.gt_uniq != NULL)
5230		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
5231	CTX_LOCK(ctx);
5232	IFDI_DETACH(ctx);
5233	CTX_UNLOCK(ctx);
5234
5235	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
5236	iflib_free_intr_mem(ctx);
5237
5238	bus_generic_detach(dev);
5239
5240	iflib_tx_structures_free(ctx);
5241	iflib_rx_structures_free(ctx);
5242
5243	iflib_deregister(ctx);
5244
5245	device_set_softc(ctx->ifc_dev, NULL);
5246	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5247		free(ctx->ifc_softc, M_IFLIB);
5248	unref_ctx_core_offset(ctx);
5249	free(ctx, M_IFLIB);
5250	return (0);
5251}
5252
5253static void
5254iflib_free_intr_mem(if_ctx_t ctx)
5255{
5256
5257	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
5258		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
5259	}
5260	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
5261		pci_release_msi(ctx->ifc_dev);
5262	}
5263	if (ctx->ifc_msix_mem != NULL) {
5264		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
5265		    rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
5266		ctx->ifc_msix_mem = NULL;
5267	}
5268}
5269
5270int
5271iflib_device_detach(device_t dev)
5272{
5273	if_ctx_t ctx = device_get_softc(dev);
5274
5275	return (iflib_device_deregister(ctx));
5276}
5277
5278int
5279iflib_device_suspend(device_t dev)
5280{
5281	if_ctx_t ctx = device_get_softc(dev);
5282
5283	CTX_LOCK(ctx);
5284	IFDI_SUSPEND(ctx);
5285	CTX_UNLOCK(ctx);
5286
5287	return bus_generic_suspend(dev);
5288}
5289int
5290iflib_device_shutdown(device_t dev)
5291{
5292	if_ctx_t ctx = device_get_softc(dev);
5293
5294	CTX_LOCK(ctx);
5295	IFDI_SHUTDOWN(ctx);
5296	CTX_UNLOCK(ctx);
5297
5298	return bus_generic_suspend(dev);
5299}
5300
5301
5302int
5303iflib_device_resume(device_t dev)
5304{
5305	if_ctx_t ctx = device_get_softc(dev);
5306	iflib_txq_t txq = ctx->ifc_txqs;
5307	int i;
5308
5309	CTX_LOCK(ctx);
5310	IFDI_RESUME(ctx);
5311	iflib_if_init_locked(ctx);
5312	CTX_UNLOCK(ctx);
5313	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
5314		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
5315
5316	return (bus_generic_resume(dev));
5317}
5318
5319int
5320iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
5321{
5322	int error;
5323	if_ctx_t ctx = device_get_softc(dev);
5324
5325	CTX_LOCK(ctx);
5326	error = IFDI_IOV_INIT(ctx, num_vfs, params);
5327	CTX_UNLOCK(ctx);
5328
5329	return (error);
5330}
5331
5332void
5333iflib_device_iov_uninit(device_t dev)
5334{
5335	if_ctx_t ctx = device_get_softc(dev);
5336
5337	CTX_LOCK(ctx);
5338	IFDI_IOV_UNINIT(ctx);
5339	CTX_UNLOCK(ctx);
5340}
5341
5342int
5343iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
5344{
5345	int error;
5346	if_ctx_t ctx = device_get_softc(dev);
5347
5348	CTX_LOCK(ctx);
5349	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
5350	CTX_UNLOCK(ctx);
5351
5352	return (error);
5353}
5354
5355/*********************************************************************
5356 *
5357 *  MODULE FUNCTION DEFINITIONS
5358 *
5359 **********************************************************************/
5360
5361/*
5362 * - Start a fast taskqueue thread for each core
5363 * - Start a taskqueue for control operations
5364 */
5365static int
5366iflib_module_init(void)
5367{
5368	return (0);
5369}
5370
5371static int
5372iflib_module_event_handler(module_t mod, int what, void *arg)
5373{
5374	int err;
5375
5376	switch (what) {
5377	case MOD_LOAD:
5378		if ((err = iflib_module_init()) != 0)
5379			return (err);
5380		break;
5381	case MOD_UNLOAD:
5382		return (EBUSY);
5383	default:
5384		return (EOPNOTSUPP);
5385	}
5386
5387	return (0);
5388}
5389
5390/*********************************************************************
5391 *
5392 *  PUBLIC FUNCTION DEFINITIONS
5393 *     ordered as in iflib.h
5394 *
5395 **********************************************************************/
5396
5397
5398static void
5399_iflib_assert(if_shared_ctx_t sctx)
5400{
5401	int i;
5402
5403	MPASS(sctx->isc_tx_maxsize);
5404	MPASS(sctx->isc_tx_maxsegsize);
5405
5406	MPASS(sctx->isc_rx_maxsize);
5407	MPASS(sctx->isc_rx_nsegments);
5408	MPASS(sctx->isc_rx_maxsegsize);
5409
5410	MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
5411	for (i = 0; i < sctx->isc_nrxqs; i++) {
5412		MPASS(sctx->isc_nrxd_min[i]);
5413		MPASS(powerof2(sctx->isc_nrxd_min[i]));
5414		MPASS(sctx->isc_nrxd_max[i]);
5415		MPASS(powerof2(sctx->isc_nrxd_max[i]));
5416		MPASS(sctx->isc_nrxd_default[i]);
5417		MPASS(powerof2(sctx->isc_nrxd_default[i]));
5418	}
5419
5420	MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
5421	for (i = 0; i < sctx->isc_ntxqs; i++) {
5422		MPASS(sctx->isc_ntxd_min[i]);
5423		MPASS(powerof2(sctx->isc_ntxd_min[i]));
5424		MPASS(sctx->isc_ntxd_max[i]);
5425		MPASS(powerof2(sctx->isc_ntxd_max[i]));
5426		MPASS(sctx->isc_ntxd_default[i]);
5427		MPASS(powerof2(sctx->isc_ntxd_default[i]));
5428	}
5429}
5430
5431static void
5432_iflib_pre_assert(if_softc_ctx_t scctx)
5433{
5434
5435	MPASS(scctx->isc_txrx->ift_txd_encap);
5436	MPASS(scctx->isc_txrx->ift_txd_flush);
5437	MPASS(scctx->isc_txrx->ift_txd_credits_update);
5438	MPASS(scctx->isc_txrx->ift_rxd_available);
5439	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
5440	MPASS(scctx->isc_txrx->ift_rxd_refill);
5441	MPASS(scctx->isc_txrx->ift_rxd_flush);
5442}
5443
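/*
 * Allocate and initialize the ifnet, install the iflib ifnet methods and
 * VLAN event handlers, and compile the driver's kobj method table.
 */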
5444static int
5445iflib_register(if_ctx_t ctx)
5446{
5447	if_shared_ctx_t sctx = ctx->ifc_sctx;
5448	driver_t *driver = sctx->isc_driver;
5449	device_t dev = ctx->ifc_dev;
5450	if_t ifp;
5451
5452	_iflib_assert(sctx);
5453
5454	CTX_LOCK_INIT(ctx);
5455	STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
5456	ifp = ctx->ifc_ifp = if_alloc(IFT_ETHER);
5457	if (ifp == NULL) {
5458		device_printf(dev, "can not allocate ifnet structure\n");
5459		return (ENOMEM);
5460	}
5461
5462	/*
5463	 * Initialize our context's device specific methods
5464	 */
5465	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
5466	kobj_class_compile((kobj_class_t) driver);
5467#ifndef __HAIKU__
5468	driver->refs++;
5469#endif
5470
5471	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
5472	if_setsoftc(ifp, ctx);
5473	if_setdev(ifp, dev);
5474	if_setinitfn(ifp, iflib_if_init);
5475	if_setioctlfn(ifp, iflib_if_ioctl);
5476#ifdef ALTQ
5477	if_setstartfn(ifp, iflib_altq_if_start);
5478	if_settransmitfn(ifp, iflib_altq_if_transmit);
5479	if_setsendqready(ifp);
5480#else
5481	if_settransmitfn(ifp, iflib_if_transmit);
5482#endif
5483	if_setqflushfn(ifp, iflib_if_qflush);
5484	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
5485
5486	ctx->ifc_vlan_attach_event =
5487		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
5488							  EVENTHANDLER_PRI_FIRST);
5489	ctx->ifc_vlan_detach_event =
5490		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
5491							  EVENTHANDLER_PRI_FIRST);
5492
5493	if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
5494		dprintf("DRVMEDia");
5495		ctx->ifc_mediap = &ctx->ifc_media;
5496		ifmedia_init(ctx->ifc_mediap, IFM_IMASK,
5497		    iflib_media_change, iflib_media_status);
5498	}
5499	return (0);
5500}
5501
5502static void
5503iflib_deregister(if_ctx_t ctx)
5504{
5505	if_t ifp = ctx->ifc_ifp;
5506
5507	/* Remove all media */
5508	ifmedia_removeall(&ctx->ifc_media);
5509
5510	/* Unregister VLAN events */
5511	if (ctx->ifc_vlan_attach_event != NULL) {
5512		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
5513		ctx->ifc_vlan_attach_event = NULL;
5514	}
5515	if (ctx->ifc_vlan_detach_event != NULL) {
5516		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
5517		ctx->ifc_vlan_detach_event = NULL;
5518	}
5519
5520#ifndef __HAIKU__
5521	/* Release kobject reference */
5522	kobj_delete((kobj_t) ctx, NULL);
5523#endif
5524
5525	/* Free the ifnet structure */
5526	if_free(ifp);
5527
5528	STATE_LOCK_DESTROY(ctx);
5529
5530	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
5531	CTX_LOCK_DESTROY(ctx);
5532}
5533
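/*
 * Allocate the software TX/RX queue structures, the DMA memory backing
 * each hardware descriptor ring, and the per-TX-queue mp_rings.
 */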
static int
iflib_queues_alloc(if_ctx_t ctx)
{
	if_shared_ctx_t sctx = ctx->ifc_sctx;
	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
	device_t dev = ctx->ifc_dev;
	int nrxqsets = scctx->isc_nrxqsets;
	int ntxqsets = scctx->isc_ntxqsets;
	iflib_txq_t txq;
	iflib_rxq_t rxq;
	iflib_fl_t fl = NULL;
	int i, j, cpu, err, txconf, rxconf;
	iflib_dma_info_t ifdip;
	uint32_t *rxqsizes = scctx->isc_rxqsizes;
	uint32_t *txqsizes = scctx->isc_txqsizes;
	uint8_t nrxqs = sctx->isc_nrxqs;
	uint8_t ntxqs = sctx->isc_ntxqs;
	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
	caddr_t *vaddrs;
	uint64_t *paddrs;

	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));

	/* Allocate the TX ring struct memory */
	if (!(ctx->ifc_txqs =
	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
		device_printf(dev, "Unable to allocate TX ring memory\n");
		err = ENOMEM;
		goto fail;
	}

	/* Now allocate the RX */
	if (!(ctx->ifc_rxqs =
	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
		device_printf(dev, "Unable to allocate RX ring memory\n");
		err = ENOMEM;
		goto rx_fail;
	}

	txq = ctx->ifc_txqs;
	rxq = ctx->ifc_rxqs;

	/*
	 * XXX handle allocation failure
	 */
	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
		/* Set up some basics */

		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
			device_printf(dev,
			    "Unable to allocate TX DMA info memory\n");
			err = ENOMEM;
			goto err_tx_desc;
		}
		txq->ift_ifdi = ifdip;
		for (j = 0; j < ntxqs; j++, ifdip++) {
			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
				device_printf(dev,
				    "Unable to allocate TX descriptors\n");
				err = ENOMEM;
				goto err_tx_desc;
			}
			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
		}
		txq->ift_ctx = ctx;
		txq->ift_id = i;
		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
			txq->ift_br_offset = 1;
		} else {
			txq->ift_br_offset = 0;
		}
#ifndef __HAIKU__
		/* XXX fix this */
		txq->ift_timer.c_cpu = cpu;
#endif

		if (iflib_txsd_alloc(txq)) {
			device_printf(dev, "Critical Failure setting up TX buffers\n");
			err = ENOMEM;
			goto err_tx_desc;
		}

		/* Initialize the TX lock */
		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
		    device_get_nameunit(dev), txq->ift_id);
		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);

		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
		if (err) {
			/* XXX free any allocated rings */
			device_printf(dev, "Unable to allocate buf_ring\n");
			goto err_tx_desc;
		}
	}

	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
		/* Set up some basics */

		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
		   M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
			device_printf(dev,
			    "Unable to allocate RX DMA info memory\n");
			err = ENOMEM;
			goto err_tx_desc;
		}

		rxq->ifr_ifdi = ifdip;
		/* XXX this needs to be changed if #rx queues != #tx queues */
		rxq->ifr_ntxqirq = 1;
		rxq->ifr_txqid[0] = i;
		for (j = 0; j < nrxqs; j++, ifdip++) {
			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
				device_printf(dev,
				    "Unable to allocate RX descriptors\n");
				err = ENOMEM;
				goto err_tx_desc;
			}
			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
		}
		rxq->ifr_ctx = ctx;
		rxq->ifr_id = i;
		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
			rxq->ifr_fl_offset = 1;
		} else {
			rxq->ifr_fl_offset = 0;
		}
		rxq->ifr_nfl = nfree_lists;
		if (!(fl =
			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
			device_printf(dev, "Unable to allocate free list memory\n");
			err = ENOMEM;
			goto err_tx_desc;
		}
		rxq->ifr_fl = fl;
		for (j = 0; j < nfree_lists; j++) {
			fl[j].ifl_rxq = rxq;
			fl[j].ifl_id = j;
			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
		}
		/* Allocate receive buffers for the ring */
		if (iflib_rxsd_alloc(rxq)) {
			device_printf(dev,
			    "Critical Failure setting up receive buffers\n");
			err = ENOMEM;
			goto err_rx_desc;
		}

		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
			    M_WAITOK);
	}

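	/*
	 * Flatten the per-queue DMA addresses into vaddrs[]/paddrs[]
	 * (indexed by qset * nqs + q) so each direction can be handed to
	 * the driver in a single IFDI_*_QUEUES_ALLOC() call.
	 */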
	/* TXQs */
	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
	for (i = 0; i < ntxqsets; i++) {
		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;

		for (j = 0; j < ntxqs; j++, di++) {
			vaddrs[i*ntxqs + j] = di->idi_vaddr;
			paddrs[i*ntxqs + j] = di->idi_paddr;
		}
	}
	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
		device_printf(ctx->ifc_dev,
		    "Unable to allocate device TX queue\n");
		iflib_tx_structures_free(ctx);
		free(vaddrs, M_IFLIB);
		free(paddrs, M_IFLIB);
		goto err_rx_desc;
	}
	free(vaddrs, M_IFLIB);
	free(paddrs, M_IFLIB);

	/* RXQs */
	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
	for (i = 0; i < nrxqsets; i++) {
		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;

		for (j = 0; j < nrxqs; j++, di++) {
			vaddrs[i*nrxqs + j] = di->idi_vaddr;
			paddrs[i*nrxqs + j] = di->idi_paddr;
		}
	}
	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
		device_printf(ctx->ifc_dev,
		    "Unable to allocate device RX queue\n");
		iflib_tx_structures_free(ctx);
		free(vaddrs, M_IFLIB);
		free(paddrs, M_IFLIB);
		goto err_rx_desc;
	}
	free(vaddrs, M_IFLIB);
	free(paddrs, M_IFLIB);

	return (0);

/* XXX handle allocation failure changes */
err_rx_desc:
err_tx_desc:
rx_fail:
	if (ctx->ifc_rxqs != NULL)
		free(ctx->ifc_rxqs, M_IFLIB);
	ctx->ifc_rxqs = NULL;
	if (ctx->ifc_txqs != NULL)
		free(ctx->ifc_txqs, M_IFLIB);
	ctx->ifc_txqs = NULL;
fail:
	return (err);
}

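/*
 * Run iflib_txq_setup() on every TX queue set.
 */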
static int
iflib_tx_structures_setup(if_ctx_t ctx)
{
	iflib_txq_t txq = ctx->ifc_txqs;
	int i;

	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
		iflib_txq_setup(txq);

	return (0);
}

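/*
 * Tear down every TX queue set: destroy the software queue state, free
 * the descriptor ring DMA memory, release the queue array, and let the
 * driver free its own queue resources via IFDI_QUEUES_FREE().
 */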
static void
iflib_tx_structures_free(if_ctx_t ctx)
{
	iflib_txq_t txq = ctx->ifc_txqs;
	if_shared_ctx_t sctx = ctx->ifc_sctx;
	int i, j;

	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
		iflib_txq_destroy(txq);
		for (j = 0; j < sctx->isc_ntxqs; j++)
			iflib_dma_free(&txq->ift_ifdi[j]);
	}
	free(ctx->ifc_txqs, M_IFLIB);
	ctx->ifc_txqs = NULL;
	IFDI_QUEUES_FREE(ctx);
}

/*********************************************************************
 *
 *  Initialize all receive rings.
 *
 **********************************************************************/
static int
iflib_rx_structures_setup(if_ctx_t ctx)
{
	iflib_rxq_t rxq = ctx->ifc_rxqs;
	int q;
#if defined(INET6) || defined(INET)
	int i, err;
#endif

	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
#if defined(INET6) || defined(INET)
		if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO) {
			err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
			    TCP_LRO_ENTRIES, min(1024,
			    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
			if (err != 0) {
				device_printf(ctx->ifc_dev,
				    "LRO Initialization failed!\n");
				goto fail;
			}
		}
#endif
		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
	}
	return (0);
#if defined(INET6) || defined(INET)
fail:
	/*
	 * Free the LRO resources allocated so far; we only handle the
	 * rings that completed, since the failing case will have cleaned
	 * up after itself.  'q' failed, so it's the terminus.
	 */
	rxq = ctx->ifc_rxqs;
	for (i = 0; i < q; ++i, rxq++) {
		if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO)
			tcp_lro_free(&rxq->ifr_lc);
	}
	return (err);
#endif
}

/*********************************************************************
 *
 *  Free all receive rings.
 *
 **********************************************************************/
static void
iflib_rx_structures_free(if_ctx_t ctx)
{
	iflib_rxq_t rxq = ctx->ifc_rxqs;
	int i;

	for (i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
		iflib_rx_sds_free(rxq);
#if defined(INET6) || defined(INET)
		if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO)
			tcp_lro_free(&rxq->ifr_lc);
#endif
	}
	free(ctx->ifc_rxqs, M_IFLIB);
	ctx->ifc_rxqs = NULL;
}

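/*
 * Set up the TX queue sets and then the RX queue sets; any error is
 * logged and returned, and the caller is responsible for freeing the
 * queues on failure.
 */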
static int
iflib_qset_structures_setup(if_ctx_t ctx)
{
	int err;

	/*
	 * It is expected that the caller takes care of freeing queues if this
	 * fails.
	 */
	if ((err = iflib_tx_structures_setup(ctx)) != 0) {
		device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
		return (err);
	}

	if ((err = iflib_rx_structures_setup(ctx)) != 0)
		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);

	return (err);
}

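/*
 * Exported helper for drivers: forwards to _iflib_irq_alloc() to set up
 * a device interrupt with the given filter and handler.
 */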
int
iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
		driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
{

	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
}

#ifdef SMP
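/*
 * Map queue id 'qid' onto the (qid % ncpus)'th CPU in the context's CPU
 * set and return that CPU's id.  For example, with four usable CPUs a
 * qid of 6 resolves to the third CPU in the set.
 */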
static int
find_nth(if_ctx_t ctx, int qid)
{
	cpuset_t cpus;
	int i, cpuid, eqid, count;

	CPU_COPY(&ctx->ifc_cpus, &cpus);
	count = CPU_COUNT(&cpus);
	eqid = qid % count;
	/* clear the first eqid set bits */
	for (i = 0; i < eqid; i++) {
		cpuid = CPU_FFS(&cpus);
		MPASS(cpuid != 0);
		CPU_CLR(cpuid-1, &cpus);
	}
	cpuid = CPU_FFS(&cpus);
	MPASS(cpuid != 0);
	return (cpuid-1);
}

#ifdef SCHED_ULE
extern struct cpu_group *cpu_top;              /* CPU topology */

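/*
 * Return the index of the child group of 'grp' whose CPU mask contains
 * 'cpu', or -1 if 'grp' has no children or none of them contains 'cpu'.
 */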
static int
find_child_with_core(int cpu, struct cpu_group *grp)
{
	int i;

	if (grp->cg_children == 0)
		return -1;

	MPASS(grp->cg_child);
	for (i = 0; i < grp->cg_children; i++) {
		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
			return i;
	}

	return -1;
}

/*
 * Find the nth "close" core to the specified core
 * "close" is defined as the deepest level that shares
 * at least an L2 cache.