diff --git a/.container_build_image b/.container_build_image new file mode 100644 index 0000000000000..787e80d3642d7 --- /dev/null +++ b/.container_build_image @@ -0,0 +1 @@ +rocky-10-kernel-builder diff --git a/.github/workflows/kernel-build-and-test-multiarch-trigger.yml b/.github/workflows/kernel-build-and-test-multiarch-trigger.yml new file mode 100644 index 0000000000000..ae7c4550640f4 --- /dev/null +++ b/.github/workflows/kernel-build-and-test-multiarch-trigger.yml @@ -0,0 +1,11 @@ +name: Trigger Automated kernel build and test (multi-arch) + +on: + push: + branches: + - '*_rlc-10/**' + +jobs: + kernelCI: + uses: ctrliq/kernel-src-tree/.github/workflows/kernel-build-and-test-multiarch-trigger.yml@main + secrets: inherit diff --git a/drivers/infiniband/hw/mana/counters.c b/drivers/infiniband/hw/mana/counters.c index e533ce21013db..e964e74be48da 100644 --- a/drivers/infiniband/hw/mana/counters.c +++ b/drivers/infiniband/hw/mana/counters.c @@ -32,8 +32,32 @@ static const struct rdma_stat_desc mana_ib_port_stats_desc[] = { [MANA_IB_RATE_INC_EVENTS].name = "rate_inc_events", [MANA_IB_NUM_QPS_RECOVERED].name = "num_qps_recovered", [MANA_IB_CURRENT_RATE].name = "current_rate", + [MANA_IB_DUP_RX_REQ].name = "dup_rx_requests", + [MANA_IB_TX_BYTES].name = "tx_bytes", + [MANA_IB_RX_BYTES].name = "rx_bytes", + [MANA_IB_RX_SEND_REQ].name = "rx_send_requests", + [MANA_IB_RX_WRITE_REQ].name = "rx_write_requests", + [MANA_IB_RX_READ_REQ].name = "rx_read_requests", + [MANA_IB_TX_PKT].name = "tx_packets", + [MANA_IB_RX_PKT].name = "rx_packets", }; +static const struct rdma_stat_desc mana_ib_device_stats_desc[] = { + [MANA_IB_SENT_CNPS].name = "sent_cnps", + [MANA_IB_RECEIVED_ECNS].name = "received_ecns", + [MANA_IB_RECEIVED_CNP_COUNT].name = "received_cnp_count", + [MANA_IB_QP_CONGESTED_EVENTS].name = "qp_congested_events", + [MANA_IB_QP_RECOVERED_EVENTS].name = "qp_recovered_events", + [MANA_IB_DEV_RATE_INC_EVENTS].name = "rate_inc_events", +}; + +struct rdma_hw_stats *mana_ib_alloc_hw_device_stats(struct ib_device *ibdev) +{ + return rdma_alloc_hw_stats_struct(mana_ib_device_stats_desc, + ARRAY_SIZE(mana_ib_device_stats_desc), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { @@ -42,8 +66,39 @@ struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, RDMA_HW_STATS_DEFAULT_LIFESPAN); } -int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, - u32 port_num, int index) +static int mana_ib_get_hw_device_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats) +{ + struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, + ib_dev); + struct mana_rnic_query_device_cntrs_resp resp = {}; + struct mana_rnic_query_device_cntrs_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_IB_QUERY_DEVICE_COUNTERS, + sizeof(req), sizeof(resp)); + req.hdr.dev_id = mdev->gdma_dev->dev_id; + req.adapter = mdev->adapter_handle; + + err = mana_gd_send_request(mdev_to_gc(mdev), sizeof(req), &req, + sizeof(resp), &resp); + if (err) { + ibdev_err(&mdev->ib_dev, "Failed to query device counters err %d", + err); + return err; + } + + stats->value[MANA_IB_SENT_CNPS] = resp.sent_cnps; + stats->value[MANA_IB_RECEIVED_ECNS] = resp.received_ecns; + stats->value[MANA_IB_RECEIVED_CNP_COUNT] = resp.received_cnp_count; + stats->value[MANA_IB_QP_CONGESTED_EVENTS] = resp.qp_congested_events; + stats->value[MANA_IB_QP_RECOVERED_EVENTS] = resp.qp_recovered_events; + stats->value[MANA_IB_DEV_RATE_INC_EVENTS] = resp.rate_inc_events; + + return ARRAY_SIZE(mana_ib_device_stats_desc); +} + +static int mana_ib_get_hw_port_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num) { struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); @@ -53,6 +108,7 @@ int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, mana_gd_init_req_hdr(&req.hdr, MANA_IB_QUERY_VF_COUNTERS, sizeof(req), sizeof(resp)); + req.hdr.resp.msg_version = GDMA_MESSAGE_V2; req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; @@ -101,5 +157,23 @@ int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, stats->value[MANA_IB_NUM_QPS_RECOVERED] = resp.num_qps_recovered; stats->value[MANA_IB_CURRENT_RATE] = resp.current_rate; + stats->value[MANA_IB_DUP_RX_REQ] = resp.dup_rx_req; + stats->value[MANA_IB_TX_BYTES] = resp.tx_bytes; + stats->value[MANA_IB_RX_BYTES] = resp.rx_bytes; + stats->value[MANA_IB_RX_SEND_REQ] = resp.rx_send_req; + stats->value[MANA_IB_RX_WRITE_REQ] = resp.rx_write_req; + stats->value[MANA_IB_RX_READ_REQ] = resp.rx_read_req; + stats->value[MANA_IB_TX_PKT] = resp.tx_pkt; + stats->value[MANA_IB_RX_PKT] = resp.rx_pkt; + return ARRAY_SIZE(mana_ib_port_stats_desc); } + +int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + if (!port_num) + return mana_ib_get_hw_device_stats(ibdev, stats); + else + return mana_ib_get_hw_port_stats(ibdev, stats, port_num); +} diff --git a/drivers/infiniband/hw/mana/counters.h b/drivers/infiniband/hw/mana/counters.h index 7ff92d27f6c3a..f68e776bb41d7 100644 --- a/drivers/infiniband/hw/mana/counters.h +++ b/drivers/infiniband/hw/mana/counters.h @@ -35,10 +35,28 @@ enum mana_ib_port_counters { MANA_IB_RATE_INC_EVENTS, MANA_IB_NUM_QPS_RECOVERED, MANA_IB_CURRENT_RATE, + MANA_IB_DUP_RX_REQ, + MANA_IB_TX_BYTES, + MANA_IB_RX_BYTES, + MANA_IB_RX_SEND_REQ, + MANA_IB_RX_WRITE_REQ, + MANA_IB_RX_READ_REQ, + MANA_IB_TX_PKT, + MANA_IB_RX_PKT, +}; + +enum mana_ib_device_counters { + MANA_IB_SENT_CNPS, + MANA_IB_RECEIVED_ECNS, + MANA_IB_RECEIVED_CNP_COUNT, + MANA_IB_QP_CONGESTED_EVENTS, + MANA_IB_QP_RECOVERED_EVENTS, + MANA_IB_DEV_RATE_INC_EVENTS, }; struct rdma_hw_stats *mana_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num); +struct rdma_hw_stats *mana_ib_alloc_hw_device_stats(struct ib_device *ibdev); int mana_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port_num, int index); #endif /* _COUNTERS_H_ */ diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 28e154bbb50f8..1becc87791235 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -291,6 +291,32 @@ static int mana_process_completions(struct mana_ib_cq *cq, int nwc, struct ib_wc return wc_index; } +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev) +{ + struct mana_ib_qp *qp = mana_get_qp_ref(mdev, MANA_GSI_QPN, false); + struct ud_sq_shadow_wqe *shadow_wqe; + struct mana_ib_cq *cq; + unsigned long flags; + + if (!qp) + return; + + cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + + spin_lock_irqsave(&cq->cq_lock, flags); + while ((shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_sq)) + != NULL) { + shadow_wqe->header.error_code = IB_WC_GENERAL_ERR; + shadow_queue_advance_next_to_complete(&qp->shadow_sq); + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + if (cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + mana_put_qp_ref(qp); +} + int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index 165c0a1e67d14..eceb6fca4c6aa 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -65,6 +65,10 @@ static const struct ib_device_ops mana_ib_stats_ops = { .get_hw_stats = mana_ib_get_hw_stats, }; +static const struct ib_device_ops mana_ib_device_stats_ops = { + .alloc_hw_device_stats = mana_ib_alloc_hw_device_stats, +}; + static int mana_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -153,6 +157,8 @@ static int mana_ib_probe(struct auxiliary_device *adev, } ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); + if (dev->adapter_caps.feature_flags & MANA_IB_FEATURE_DEV_COUNTERS_SUPPORT) + ib_set_device_ops(&dev->ib_dev, &mana_ib_device_stats_ops); ret = mana_ib_create_eqs(dev); if (ret) { @@ -218,6 +224,9 @@ static void mana_ib_remove(struct auxiliary_device *adev) { struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + if (mana_ib_is_rnic(dev)) + mana_drain_gsi_sqs(dev); + ib_unregister_device(&dev->ib_dev); dma_pool_destroy(dev->av_pool); if (mana_ib_is_rnic(dev)) { diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 42bebd6cd4f7a..38ea58fe411c7 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -43,6 +43,8 @@ */ #define MANA_AV_BUFFER_SIZE 64 +#define MANA_GSI_QPN (1) + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; @@ -210,6 +212,7 @@ enum mana_ib_command_code { MANA_IB_DESTROY_RC_QP = 0x3000b, MANA_IB_SET_QP_STATE = 0x3000d, MANA_IB_QUERY_VF_COUNTERS = 0x30022, + MANA_IB_QUERY_DEVICE_COUNTERS = 0x30023, }; struct mana_ib_query_adapter_caps_req { @@ -218,6 +221,7 @@ struct mana_ib_query_adapter_caps_req { enum mana_ib_adapter_features { MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT = BIT(4), + MANA_IB_FEATURE_DEV_COUNTERS_SUPPORT = BIT(5), }; struct mana_ib_query_adapter_caps_resp { @@ -407,7 +411,7 @@ struct mana_ib_ah_attr { u8 traffic_class; u16 src_port; u16 dest_port; - u32 reserved; + u32 flow_label; }; struct mana_rnic_set_qp_state_req { @@ -424,8 +428,15 @@ struct mana_rnic_set_qp_state_req { u32 retry_cnt; u32 rnr_retry; u32 min_rnr_timer; - u32 reserved; + u32 rate_limit; struct mana_ib_ah_attr ah_attr; + u64 reserved1; + u32 qkey; + u32 qp_access_flags; + u8 local_ack_timeout; + u8 max_rd_atomic; + u16 reserved2; + u32 reserved3; }; /* HW Data */ struct mana_rnic_set_qp_state_resp { @@ -514,6 +525,31 @@ struct mana_rnic_query_vf_cntrs_resp { u64 rate_inc_events; u64 num_qps_recovered; u64 current_rate; + u64 dup_rx_req; + u64 tx_bytes; + u64 rx_bytes; + u64 rx_send_req; + u64 rx_write_req; + u64 rx_read_req; + u64 tx_pkt; + u64 rx_pkt; +}; /* HW Data */ + +struct mana_rnic_query_device_cntrs_req { + struct gdma_req_hdr hdr; + mana_handle_t adapter; +}; /* HW Data */ + +struct mana_rnic_query_device_cntrs_resp { + struct gdma_resp_hdr hdr; + u32 sent_cnps; + u32 received_ecns; + u32 reserved1; + u32 received_cnp_count; + u32 qp_congested_events; + u32 qp_recovered_events; + u32 rate_inc_events; + u32 reserved2; }; /* HW Data */ static inline struct gdma_context *mdev_to_gc(struct mana_ib_dev *mdev) @@ -689,6 +725,7 @@ int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev); int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index a6bf4d539e670..48c1f4977f218 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -735,6 +735,8 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); + + req.hdr.req.msg_version = GDMA_MESSAGE_V3; req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.qp_handle = qp->qp_handle; @@ -748,6 +750,12 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, req.retry_cnt = attr->retry_cnt; req.rnr_retry = attr->rnr_retry; req.min_rnr_timer = attr->min_rnr_timer; + req.rate_limit = attr->rate_limit; + req.qkey = attr->qkey; + req.local_ack_timeout = attr->timeout; + req.qp_access_flags = attr->qp_access_flags; + req.max_rd_atomic = attr->max_rd_atomic; + if (attr_mask & IB_QP_AV) { ndev = mana_ib_get_netdev(&mdev->ib_dev, ibqp->port); if (!ndev) { @@ -774,6 +782,7 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ibqp->qp_num, attr->dest_qp_num); req.ah_attr.traffic_class = attr->ah_attr.grh.traffic_class >> 2; req.ah_attr.hop_limit = attr->ah_attr.grh.hop_limit; + req.ah_attr.flow_label = attr->ah_attr.grh.flow_label; } err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index eae1b6f474e62..db7fda8bcca9d 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -179,6 +179,58 @@ static int idpf_tx_singleq_csum(struct sk_buff *skb, return 1; } +/** + * idpf_tx_singleq_dma_map_error - handle TX DMA map errors + * @txq: queue to send buffer on + * @skb: send buffer + * @first: original first buffer info buffer for packet + * @idx: starting point on ring to unwind + */ +static void idpf_tx_singleq_dma_map_error(struct idpf_tx_queue *txq, + struct sk_buff *skb, + struct idpf_tx_buf *first, u16 idx) +{ + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + /* clear dma mappings for failed tx_buf map */ + for (;;) { + struct idpf_tx_buf *tx_buf; + + tx_buf = &txq->tx_buf[idx]; + libeth_tx_complete(tx_buf, &cp); + if (tx_buf == first) + break; + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + if (skb_is_gso(skb)) { + union idpf_tx_flex_desc *tx_desc; + + /* If we failed a DMA mapping for a TSO packet, we will have + * used one additional descriptor for a context + * descriptor. Reset that here. + */ + tx_desc = &txq->flex_tx[idx]; + memset(tx_desc, 0, sizeof(*tx_desc)); + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + /* Update tail in case netdev_xmit_more was previously true */ + idpf_tx_buf_hw_update(txq, idx, false); +} + /** * idpf_tx_singleq_map - Build the Tx base descriptor * @tx_q: queue to send buffer on @@ -219,8 +271,9 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_singleq_dma_map_error(tx_q, skb, + first, i); /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); @@ -362,17 +415,18 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, { struct idpf_tx_offload_params offload = { }; struct idpf_tx_buf *first; - unsigned int count; + u32 count, buf_count = 1; + int csum, tso, needed; __be16 protocol; - int csum, tso; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); - if (idpf_tx_maybe_stop_common(tx_q, - count + IDPF_TX_DESCS_PER_CACHE_LINE + - IDPF_TX_DESCS_FOR_CTX)) { + needed = count + IDPF_TX_DESCS_PER_CACHE_LINE + IDPF_TX_DESCS_FOR_CTX; + if (!netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, + IDPF_DESC_UNUSED(tx_q), + needed, needed)) { idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false); u64_stats_update_begin(&tx_q->stats_sync); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 6a01b3a5b0913..52a6af1bc61fc 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -7,47 +7,12 @@ #include "idpf.h" #include "idpf_virtchnl.h" -struct idpf_tx_stash { - struct hlist_node hlist; - struct libeth_sqe buf; -}; - -#define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) +#define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv) LIBETH_SQE_CHECK_PRIV(u32); static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); -/** - * idpf_buf_lifo_push - push a buffer pointer onto stack - * @stack: pointer to stack struct - * @buf: pointer to buf to push - * - * Returns 0 on success, negative on failure - **/ -static int idpf_buf_lifo_push(struct idpf_buf_lifo *stack, - struct idpf_tx_stash *buf) -{ - if (unlikely(stack->top == stack->size)) - return -ENOSPC; - - stack->bufs[stack->top++] = buf; - - return 0; -} - -/** - * idpf_buf_lifo_pop - pop a buffer pointer from stack - * @stack: pointer to stack struct - **/ -static struct idpf_tx_stash *idpf_buf_lifo_pop(struct idpf_buf_lifo *stack) -{ - if (unlikely(!stack->top)) - return NULL; - - return stack->bufs[--stack->top]; -} - /** * idpf_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure @@ -76,52 +41,22 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue) static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { struct libeth_sq_napi_stats ss = { }; - struct idpf_buf_lifo *buf_stack; - struct idpf_tx_stash *stash; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - struct hlist_node *tmp; - u32 i, tag; + u32 i; /* Buffers already cleared, nothing to do */ if (!txq->tx_buf) return; /* Free all the Tx buffer sk_buffs */ - for (i = 0; i < txq->desc_count; i++) + for (i = 0; i < txq->buf_pool_size; i++) libeth_tx_complete(&txq->tx_buf[i], &cp); kfree(txq->tx_buf); txq->tx_buf = NULL; - - if (!idpf_queue_has(FLOW_SCH_EN, txq)) - return; - - buf_stack = &txq->stash->buf_stack; - if (!buf_stack->bufs) - return; - - /* - * If a Tx timeout occurred, there are potentially still bufs in the - * hash table, free them here. - */ - hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash, - hlist) { - if (!stash) - continue; - - libeth_tx_complete(&stash->buf, &cp); - hash_del(&stash->hlist); - idpf_buf_lifo_push(buf_stack, stash); - } - - for (i = 0; i < buf_stack->size; i++) - kfree(buf_stack->bufs[i]); - - kfree(buf_stack->bufs); - buf_stack->bufs = NULL; } /** @@ -138,6 +73,9 @@ static void idpf_tx_desc_rel(struct idpf_tx_queue *txq) if (!txq->desc_ring) return; + if (txq->refillq) + kfree(txq->refillq->ring); + dmam_free_coherent(txq->dev, txq->size, txq->desc_ring, txq->dma); txq->desc_ring = NULL; txq->next_to_use = 0; @@ -194,41 +132,18 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport) */ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) { - struct idpf_buf_lifo *buf_stack; - int buf_size; - int i; - /* Allocate book keeping buffers only. Buffers to be supplied to HW * are allocated by kernel network stack and received as part of skb */ - buf_size = sizeof(struct idpf_tx_buf) * tx_q->desc_count; - tx_q->tx_buf = kzalloc(buf_size, GFP_KERNEL); + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) + tx_q->buf_pool_size = U16_MAX; + else + tx_q->buf_pool_size = tx_q->desc_count; + tx_q->tx_buf = kcalloc(tx_q->buf_pool_size, sizeof(*tx_q->tx_buf), + GFP_KERNEL); if (!tx_q->tx_buf) return -ENOMEM; - if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) - return 0; - - buf_stack = &tx_q->stash->buf_stack; - - /* Initialize tx buf stack for out-of-order completions if - * flow scheduling offload is enabled - */ - buf_stack->bufs = kcalloc(tx_q->desc_count, sizeof(*buf_stack->bufs), - GFP_KERNEL); - if (!buf_stack->bufs) - return -ENOMEM; - - buf_stack->size = tx_q->desc_count; - buf_stack->top = tx_q->desc_count; - - for (i = 0; i < tx_q->desc_count; i++) { - buf_stack->bufs[i] = kzalloc(sizeof(*buf_stack->bufs[i]), - GFP_KERNEL); - if (!buf_stack->bufs[i]) - return -ENOMEM; - } - return 0; } @@ -243,6 +158,7 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, struct idpf_tx_queue *tx_q) { struct device *dev = tx_q->dev; + struct idpf_sw_queue *refillq; int err; err = idpf_tx_buf_alloc_all(tx_q); @@ -266,6 +182,31 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, tx_q->next_to_clean = 0; idpf_queue_set(GEN_CHK, tx_q); + if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) + return 0; + + refillq = tx_q->refillq; + refillq->desc_count = tx_q->buf_pool_size; + refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), + GFP_KERNEL); + if (!refillq->ring) { + err = -ENOMEM; + goto err_alloc; + } + + for (unsigned int i = 0; i < refillq->desc_count; i++) + refillq->ring[i] = + FIELD_PREP(IDPF_RFL_BI_BUFID_M, i) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, + idpf_queue_has(GEN_CHK, refillq)); + + /* Go ahead and flip the GEN bit since this counts as filling + * up the ring, i.e. we already ring wrapped. + */ + idpf_queue_change(GEN_CHK, refillq); + + tx_q->last_re = tx_q->desc_count - IDPF_TX_SPLITQ_RE_MIN_GAP; + return 0; err_alloc: @@ -316,8 +257,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) for (i = 0; i < vport->num_txq_grp; i++) { for (j = 0; j < vport->txq_grps[i].num_txq; j++) { struct idpf_tx_queue *txq = vport->txq_grps[i].txqs[j]; - u8 gen_bits = 0; - u16 bufidx_mask; err = idpf_tx_desc_alloc(vport, txq); if (err) { @@ -326,34 +265,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) i); goto err_out; } - - if (!idpf_is_queue_model_split(vport->txq_model)) - continue; - - txq->compl_tag_cur_gen = 0; - - /* Determine the number of bits in the bufid - * mask and add one to get the start of the - * generation bits - */ - bufidx_mask = txq->desc_count - 1; - while (bufidx_mask >> 1) { - txq->compl_tag_gen_s++; - bufidx_mask = bufidx_mask >> 1; - } - txq->compl_tag_gen_s++; - - gen_bits = IDPF_TX_SPLITQ_COMPL_TAG_WIDTH - - txq->compl_tag_gen_s; - txq->compl_tag_gen_max = GETMAXVAL(gen_bits); - - /* Set bufid mask based on location of first - * gen bit; it cannot simply be the descriptor - * ring size-1 since we can have size values - * where not all of those bits are set. - */ - txq->compl_tag_bufid_m = - GETMAXVAL(txq->compl_tag_gen_s); } if (!idpf_is_queue_model_split(vport->txq_model)) @@ -602,18 +513,18 @@ static int idpf_rx_hdr_buf_alloc_all(struct idpf_buf_queue *bufq) } /** - * idpf_rx_post_buf_refill - Post buffer id to refill queue + * idpf_post_buf_refill - Post buffer id to refill queue * @refillq: refill queue to post to * @buf_id: buffer id to post */ -static void idpf_rx_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) +static void idpf_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) { u32 nta = refillq->next_to_use; /* store the buffer ID and the SW maintained GEN bit to the refillq */ refillq->ring[nta] = - FIELD_PREP(IDPF_RX_BI_BUFID_M, buf_id) | - FIELD_PREP(IDPF_RX_BI_GEN_M, + FIELD_PREP(IDPF_RFL_BI_BUFID_M, buf_id) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, idpf_queue_has(GEN_CHK, refillq)); if (unlikely(++nta == refillq->desc_count)) { @@ -994,6 +905,11 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) struct idpf_txq_group *txq_grp = &vport->txq_grps[i]; for (j = 0; j < txq_grp->num_txq; j++) { + if (flow_sch_en) { + kfree(txq_grp->txqs[j]->refillq); + txq_grp->txqs[j]->refillq = NULL; + } + kfree(txq_grp->txqs[j]); txq_grp->txqs[j] = NULL; } @@ -1003,9 +919,6 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) kfree(txq_grp->complq); txq_grp->complq = NULL; - - if (flow_sch_en) - kfree(txq_grp->stashes); } kfree(vport->txq_grps); vport->txq_grps = NULL; @@ -1358,7 +1271,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) for (i = 0; i < vport->num_txq_grp; i++) { struct idpf_txq_group *tx_qgrp = &vport->txq_grps[i]; struct idpf_adapter *adapter = vport->adapter; - struct idpf_txq_stash *stashes; int j; tx_qgrp->vport = vport; @@ -1371,15 +1283,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) goto err_alloc; } - if (split && flow_sch_en) { - stashes = kcalloc(num_txq, sizeof(*stashes), - GFP_KERNEL); - if (!stashes) - goto err_alloc; - - tx_qgrp->stashes = stashes; - } - for (j = 0; j < tx_qgrp->num_txq; j++) { struct idpf_tx_queue *q = tx_qgrp->txqs[j]; @@ -1399,12 +1302,14 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) if (!flow_sch_en) continue; - if (split) { - q->stash = &stashes[j]; - hash_init(q->stash->sched_buf_hash); - } - idpf_queue_set(FLOW_SCH_EN, q); + + q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL); + if (!q->refillq) + goto err_alloc; + + idpf_queue_set(GEN_CHK, q->refillq); + idpf_queue_set(RFL_GEN_CHK, q->refillq); } if (!split) @@ -1654,82 +1559,6 @@ static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q) wake_up(&vport->sw_marker_wq); } -/** - * idpf_tx_clean_stashed_bufs - clean bufs that were stored for - * out of order completions - * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) - * @cleaned: pointer to stats struct to track cleaned packets/bytes - * @budget: Used to determine if we are in netpoll - */ -static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, - u16 compl_tag, - struct libeth_sq_napi_stats *cleaned, - int budget) -{ - struct idpf_tx_stash *stash; - struct hlist_node *tmp_buf; - struct libeth_cq_pp cp = { - .dev = txq->dev, - .ss = cleaned, - .napi = budget, - }; - - /* Buffer completion */ - hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, - hlist, compl_tag) { - if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag)) - continue; - - hash_del(&stash->hlist); - libeth_tx_complete(&stash->buf, &cp); - - /* Push shadow buf back onto stack */ - idpf_buf_lifo_push(&txq->stash->buf_stack, stash); - } -} - -/** - * idpf_stash_flow_sch_buffers - store buffer parameters info to be freed at a - * later time (only relevant for flow scheduling mode) - * @txq: Tx queue to clean - * @tx_buf: buffer to store - */ -static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, - struct idpf_tx_buf *tx_buf) -{ - struct idpf_tx_stash *stash; - - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) - return 0; - - stash = idpf_buf_lifo_pop(&txq->stash->buf_stack); - if (unlikely(!stash)) { - net_err_ratelimited("%s: No out-of-order TX buffers left!\n", - netdev_name(txq->netdev)); - - return -ENOMEM; - } - - /* Store buffer params in shadow buffer */ - stash->buf.skb = tx_buf->skb; - stash->buf.bytes = tx_buf->bytes; - stash->buf.packets = tx_buf->packets; - stash->buf.type = tx_buf->type; - stash->buf.nr_frags = tx_buf->nr_frags; - dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); - dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len)); - idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf); - - /* Add buffer to buf_hash table to be freed later */ - hash_add(txq->stash->sched_buf_hash, &stash->hlist, - idpf_tx_buf_compl_tag(&stash->buf)); - - tx_buf->type = LIBETH_SQE_EMPTY; - - return 0; -} - #define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \ do { \ if (unlikely(++(ntc) == (txq)->desc_count)) { \ @@ -1757,14 +1586,8 @@ do { \ * Separate packet completion events will be reported on the completion queue, * and the buffers will be cleaned separately. The stats are not updated from * this function when using flow-based scheduling. - * - * Furthermore, in flow scheduling mode, check to make sure there are enough - * reserve buffers to stash the packet. If there are not, return early, which - * will leave next_to_clean pointing to the packet that failed to be stashed. - * - * Return: false in the scenario above, true otherwise. */ -static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, +static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, struct libeth_sq_napi_stats *cleaned, bool descs_only) @@ -1778,7 +1601,12 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, .napi = napi_budget, }; struct idpf_tx_buf *tx_buf; - bool clean_complete = true; + + if (descs_only) { + /* Bump ring index to mark as cleaned. */ + tx_q->next_to_clean = end; + return; + } tx_desc = &tx_q->flex_tx[ntc]; next_pending_desc = &tx_q->flex_tx[end]; @@ -1798,132 +1626,58 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, break; eop_idx = tx_buf->rs_idx; + libeth_tx_complete(tx_buf, &cp); - if (descs_only) { - if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) { - clean_complete = false; - goto tx_splitq_clean_out; - } - - idpf_stash_flow_sch_buffers(tx_q, tx_buf); + /* unmap remaining buffers */ + while (ntc != eop_idx) { + idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, + tx_desc, tx_buf); - while (ntc != eop_idx) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); - idpf_stash_flow_sch_buffers(tx_q, tx_buf); - } - } else { + /* unmap any remaining paged data */ libeth_tx_complete(tx_buf, &cp); - - /* unmap remaining buffers */ - while (ntc != eop_idx) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); - - /* unmap any remaining paged data */ - libeth_tx_complete(tx_buf, &cp); - } } fetch_next_txq_desc: idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); } -tx_splitq_clean_out: tx_q->next_to_clean = ntc; - - return clean_complete; } -#define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \ -do { \ - (buf)++; \ - (ntc)++; \ - if (unlikely((ntc) == (txq)->desc_count)) { \ - buf = (txq)->tx_buf; \ - ntc = 0; \ - } \ -} while (0) - /** - * idpf_tx_clean_buf_ring - clean flow scheduling TX queue buffers + * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) + * @buf_id: packet's starting buffer ID, from completion descriptor * @cleaned: pointer to stats struct to track cleaned packets/bytes * @budget: Used to determine if we are in netpoll * - * Cleans all buffers associated with the input completion tag either from the - * TX buffer ring or from the hash table if the buffers were previously - * stashed. Returns the byte/segment count for the cleaned packet associated - * this completion tag. + * Clean all buffers associated with the packet starting at buf_id. Returns the + * byte/segment count for the cleaned packet. */ -static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, - struct libeth_sq_napi_stats *cleaned, - int budget) +static void idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, + struct libeth_sq_napi_stats *cleaned, + int budget) { - u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = cleaned, .napi = budget, }; - u16 ntc, orig_idx = idx; - tx_buf = &txq->tx_buf[idx]; - - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX || - idpf_tx_buf_compl_tag(tx_buf) != compl_tag)) - return false; - - if (tx_buf->type == LIBETH_SQE_SKB) + tx_buf = &txq->tx_buf[buf_id]; + if (tx_buf->type == LIBETH_SQE_SKB) { libeth_tx_complete(tx_buf, &cp); + idpf_post_buf_refill(txq->refillq, buf_id); + } - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + buf_id = idpf_tx_buf_next(tx_buf); - while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) { + tx_buf = &txq->tx_buf[buf_id]; libeth_tx_complete(tx_buf, &cp); - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + idpf_post_buf_refill(txq->refillq, buf_id); } - - /* - * It's possible the packet we just cleaned was an out of order - * completion, which means we can stash the buffers starting from - * the original next_to_clean and reuse the descriptors. We need - * to compare the descriptor ring next_to_clean packet's "first" buffer - * to the "first" buffer of the packet we just cleaned to determine if - * this is the case. Howevever, next_to_clean can point to either a - * reserved buffer that corresponds to a context descriptor used for the - * next_to_clean packet (TSO packet) or the "first" buffer (single - * packet). The orig_idx from the packet we just cleaned will always - * point to the "first" buffer. If next_to_clean points to a reserved - * buffer, let's bump ntc once and start the comparison from there. - */ - ntc = txq->next_to_clean; - tx_buf = &txq->tx_buf[ntc]; - - if (tx_buf->type == LIBETH_SQE_CTX) - idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf); - - /* - * If ntc still points to a different "first" buffer, clean the - * descriptor ring and stash all of the buffers for later cleaning. If - * we cannot stash all of the buffers, next_to_clean will point to the - * "first" buffer of the packet that could not be stashed and cleaning - * will start there next time. - */ - if (unlikely(tx_buf != &txq->tx_buf[orig_idx] && - !idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned, - true))) - return true; - - /* - * Otherwise, update next_to_clean to reflect the cleaning that was - * done above. - */ - txq->next_to_clean = idx; - - return true; } /** @@ -1942,22 +1696,17 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, struct libeth_sq_napi_stats *cleaned, int budget) { - u16 compl_tag; + /* RS completion contains queue head for queue based scheduling or + * completion tag for flow based scheduling. + */ + u16 rs_compl_val = le16_to_cpu(desc->q_head_compl_tag.q_head); if (!idpf_queue_has(FLOW_SCH_EN, txq)) { - u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head); - - idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + idpf_tx_splitq_clean(txq, rs_compl_val, budget, cleaned, false); return; } - compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); - - /* If we didn't clean anything on the ring, this packet must be - * in the hash table. Go clean it there. - */ - if (!idpf_tx_clean_buf_ring(txq, compl_tag, cleaned, budget)) - idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget); + idpf_tx_clean_bufs(txq, rs_compl_val, cleaned, budget); } /** @@ -2074,8 +1823,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget, /* Update BQL */ nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - dont_wake = !complq_ok || IDPF_TX_BUF_RSV_LOW(tx_q) || - np->state != __IDPF_VPORT_UP || + dont_wake = !complq_ok || np->state != __IDPF_VPORT_UP || !netif_carrier_ok(tx_q->netdev); /* Check if the TXQ needs to and can be restarted */ __netif_txq_completed_wake(nq, tx_q->cleaned_pkts, tx_q->cleaned_bytes, @@ -2132,39 +1880,47 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc, desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag); } +/** + * idpf_tx_splitq_has_room - check if enough Tx splitq resources are available + * @tx_q: the queue to be checked + * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of Tx buffers required for this packet + * + * Return: 0 if no room available, 1 otherwise + */ +static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed, + u32 bufs_needed) +{ + if (IDPF_DESC_UNUSED(tx_q) < descs_needed || + IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > + IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || + idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed) + return 0; + return 1; +} + /** * idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions * @tx_q: the queue to be checked * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of buffers needed for this packet * - * Returns 0 if stop is not needed + * Return: 0 if stop is not needed */ static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q, - unsigned int descs_needed) + u32 descs_needed, + u32 bufs_needed) { - if (idpf_tx_maybe_stop_common(tx_q, descs_needed)) - goto out; - - /* If there are too many outstanding completions expected on the - * completion queue, stop the TX queue to give the device some time to - * catch up - */ - if (unlikely(IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > - IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq))) - goto splitq_stop; - - /* Also check for available book keeping buffers; if we are low, stop - * the queue to wait for more completions + /* Since we have multiple resources to check for splitq, our + * start,stop_thrs becomes a boolean check instead of a count + * threshold. */ - if (unlikely(IDPF_TX_BUF_RSV_LOW(tx_q))) - goto splitq_stop; - - return 0; - -splitq_stop: - netif_stop_subqueue(tx_q->netdev, tx_q->idx); + if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, + idpf_txq_has_room(tx_q, descs_needed, + bufs_needed), + 1, 1)) + return 0; -out: u64_stats_update_begin(&tx_q->stats_sync); u64_stats_inc(&tx_q->q_stats.q_busy); u64_stats_update_end(&tx_q->stats_sync); @@ -2190,12 +1946,6 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); tx_q->next_to_use = val; - if (idpf_tx_maybe_stop_common(tx_q, IDPF_TX_DESC_NEEDED)) { - u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_inc(&tx_q->q_stats.q_busy); - u64_stats_update_end(&tx_q->stats_sync); - } - /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, @@ -2209,14 +1959,16 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, } /** - * idpf_tx_desc_count_required - calculate number of Tx descriptors needed + * idpf_tx_res_count_required - get number of Tx resources needed for this pkt * @txq: queue to send buffer on * @skb: send buffer + * @bufs_needed: (output) number of buffers needed for this skb. * - * Returns number of data descriptors needed for this skb. + * Return: number of data descriptors and buffers needed for this skb. */ -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb) +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, + u32 *bufs_needed) { const struct skb_shared_info *shinfo; unsigned int count = 0, i; @@ -2227,6 +1979,7 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, return count; shinfo = skb_shinfo(skb); + *bufs_needed += shinfo->nr_frags; for (i = 0; i < shinfo->nr_frags; i++) { unsigned int size; @@ -2256,71 +2009,89 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, } /** - * idpf_tx_dma_map_error - handle TX DMA map errors - * @txq: queue to send buffer on - * @skb: send buffer - * @first: original first buffer info buffer for packet - * @idx: starting point on ring to unwind + * idpf_tx_splitq_bump_ntu - adjust NTU and generation + * @txq: the tx ring to wrap + * @ntu: ring index to bump */ -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 idx) +static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) { - struct libeth_sq_napi_stats ss = { }; - struct libeth_cq_pp cp = { - .dev = txq->dev, - .ss = &ss, - }; + ntu++; - u64_stats_update_begin(&txq->stats_sync); - u64_stats_inc(&txq->q_stats.dma_map_errs); - u64_stats_update_end(&txq->stats_sync); + if (ntu == txq->desc_count) + ntu = 0; - /* clear dma mappings for failed tx_buf map */ - for (;;) { - struct idpf_tx_buf *tx_buf; + return ntu; +} - tx_buf = &txq->tx_buf[idx]; - libeth_tx_complete(tx_buf, &cp); - if (tx_buf == first) - break; - if (idx == 0) - idx = txq->desc_count; - idx--; - } +/** + * idpf_tx_get_free_buf_id - get a free buffer ID from the refill queue + * @refillq: refill queue to get buffer ID from + * @buf_id: return buffer ID + * + * Return: true if a buffer ID was found, false if not + */ +static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, + u32 *buf_id) +{ + u32 ntc = refillq->next_to_clean; + u32 refill_desc; - if (skb_is_gso(skb)) { - union idpf_tx_flex_desc *tx_desc; + refill_desc = refillq->ring[ntc]; - /* If we failed a DMA mapping for a TSO packet, we will have - * used one additional descriptor for a context - * descriptor. Reset that here. - */ - tx_desc = &txq->flex_tx[idx]; - memset(tx_desc, 0, sizeof(struct idpf_flex_tx_ctx_desc)); - if (idx == 0) - idx = txq->desc_count; - idx--; + if (unlikely(idpf_queue_has(RFL_GEN_CHK, refillq) != + !!(refill_desc & IDPF_RFL_BI_GEN_M))) + return false; + + *buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); + + if (unlikely(++ntc == refillq->desc_count)) { + idpf_queue_change(RFL_GEN_CHK, refillq); + ntc = 0; } - /* Update tail in case netdev_xmit_more was previously true */ - idpf_tx_buf_hw_update(txq, idx, false); + refillq->next_to_clean = ntc; + + return true; } /** - * idpf_tx_splitq_bump_ntu - adjust NTU and generation - * @txq: the tx ring to wrap - * @ntu: ring index to bump + * idpf_tx_splitq_pkt_err_unmap - Unmap buffers and bump tail in case of error + * @txq: Tx queue to unwind + * @params: pointer to splitq params struct + * @first: starting buffer for packet to unmap */ -static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) +static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, + struct idpf_tx_splitq_params *params, + struct idpf_tx_buf *first) { - ntu++; + struct idpf_sw_queue *refillq = txq->refillq; + struct libeth_sq_napi_stats ss = { }; + struct idpf_tx_buf *tx_buf = first; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; - if (ntu == txq->desc_count) { - ntu = 0; - txq->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(txq); + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + libeth_tx_complete(tx_buf, &cp); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + tx_buf = &txq->tx_buf[idpf_tx_buf_next(tx_buf)]; + libeth_tx_complete(tx_buf, &cp); } - return ntu; + /* Update tail in case netdev_xmit_more was previously true. */ + idpf_tx_buf_hw_update(txq, params->prev_ntu, false); + + if (!refillq) + return; + + /* Restore refillq state to avoid leaking tags. */ + if (params->prev_refill_gen != idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = params->prev_refill_ntc; } /** @@ -2344,6 +2115,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, struct netdev_queue *nq; struct sk_buff *skb; skb_frag_t *frag; + u32 next_buf_id; u16 td_cmd = 0; dma_addr_t dma; @@ -2361,17 +2133,16 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, tx_buf = first; first->nr_frags = 0; - params->compl_tag = - (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i; - for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); + } first->nr_frags++; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; tx_buf->type = LIBETH_SQE_FRAG; /* record length, and DMA address */ @@ -2427,29 +2198,12 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, max_data); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = - IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } - /* Since this packet has a buffer that is going to span - * multiple descriptors, it's going to leave holes in - * to the TX buffer ring. To ensure these holes do not - * cause issues in the cleaning routines, we will clear - * them of any stale data and assign them the same - * completion tag as the current packet. Then when the - * packet is being cleaned, the cleaning routines will - * simply pass over these holes and finish cleaning the - * rest of the packet. - */ - tx_buf->type = LIBETH_SQE_EMPTY; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; - /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, * max_data will be >= 12K and <= 16K-1. On any @@ -2474,15 +2228,25 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &next_buf_id))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); + } + } else { + next_buf_id = i; + } + idpf_tx_buf_next(tx_buf) = next_buf_id; + tx_buf = &tx_q->tx_buf[next_buf_id]; + size = skb_frag_size(frag); data_len -= size; @@ -2497,6 +2261,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, /* write last descriptor with RS and EOP bits */ first->rs_idx = i; + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; td_cmd |= params->eop_cmd; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); i = idpf_tx_splitq_bump_ntu(tx_q, i); @@ -2705,8 +2470,6 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_flex_tx_ctx_desc *desc; int i = txq->next_to_use; - txq->tx_buf[i].type = LIBETH_SQE_CTX; - /* grab the next descriptor */ desc = &txq->flex_ctx[i]; txq->next_to_use = idpf_tx_splitq_bump_ntu(txq, i); @@ -2732,6 +2495,21 @@ netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb) return NETDEV_TX_OK; } +/** + * idpf_tx_splitq_need_re - check whether RE bit needs to be set + * @tx_q: pointer to Tx queue + * + * Return: true if RE bit needs to be set, false otherwise + */ +static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q) +{ + int gap = tx_q->next_to_use - tx_q->last_re; + + gap += (gap < 0) ? tx_q->desc_count : 0; + + return gap >= IDPF_TX_SPLITQ_RE_MIN_GAP; +} + /** * idpf_tx_splitq_frame - Sends buffer on Tx ring using flex descriptors * @skb: send buffer @@ -2742,12 +2520,15 @@ netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb) static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q) { - struct idpf_tx_splitq_params tx_params = { }; + struct idpf_tx_splitq_params tx_params = { + .prev_ntu = tx_q->next_to_use, + }; struct idpf_tx_buf *first; - unsigned int count; + u32 count, buf_count = 1; int tso; + u32 buf_id; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); @@ -2757,7 +2538,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, /* Check for splitq specific TX resources */ count += (IDPF_TX_DESCS_PER_CACHE_LINE + tso); - if (idpf_tx_maybe_stop_splitq(tx_q, count)) { + if (idpf_tx_maybe_stop_splitq(tx_q, count, buf_count)) { idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false); return NETDEV_TX_BUSY; @@ -2784,36 +2565,47 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, u64_stats_update_end(&tx_q->stats_sync); } - /* record the location of the first descriptor for this packet */ - first = &tx_q->tx_buf[tx_q->next_to_use]; - first->skb = skb; + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + struct idpf_sw_queue *refillq = tx_q->refillq; - if (tso) { - first->packets = tx_params.offload.tso_segs; - first->bytes = skb->len + - ((first->packets - 1) * tx_params.offload.tso_hdr_len); - } else { - first->packets = 1; - first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); - } + /* Save refillq state in case of a packet rollback. Otherwise, + * the tags will be leaked since they will be popped from the + * refillq but never reposted during cleaning. + */ + tx_params.prev_refill_gen = + idpf_queue_has(RFL_GEN_CHK, refillq); + tx_params.prev_refill_ntc = refillq->next_to_clean; + + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &buf_id))) { + if (tx_params.prev_refill_gen != + idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = tx_params.prev_refill_ntc; + + tx_q->next_to_use = tx_params.prev_ntu; + return idpf_tx_drop_skb(tx_q, skb); + } + tx_params.compl_tag = buf_id; - if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; - /* Set the RE bit to catch any packets that may have not been - * stashed during RS completion cleaning. MIN_GAP is set to - * MIN_RING size to ensure it will be set at least once each - * time around the ring. + /* Set the RE bit to periodically "clean" the descriptor ring. + * MIN_GAP is set to MIN_RING size to ensure it will be set at + * least once each time around the ring. */ - if (!(tx_q->next_to_use % IDPF_TX_SPLITQ_RE_MIN_GAP)) { + if (idpf_tx_splitq_need_re(tx_q)) { tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE; tx_q->txq_grp->num_completions_pending++; + tx_q->last_re = tx_q->next_to_use; } if (skb->ip_summed == CHECKSUM_PARTIAL) tx_params.offload.td_cmd |= IDPF_TXD_FLEX_FLOW_CMD_CS_EN; } else { + buf_id = tx_q->next_to_use; + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2; tx_params.eop_cmd = IDPF_TXD_LAST_DESC_CMD; @@ -2821,6 +2613,18 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TX_FLEX_DESC_CMD_CS_EN; } + first = &tx_q->tx_buf[buf_id]; + first->skb = skb; + + if (tso) { + first->packets = tx_params.offload.tso_segs; + first->bytes = skb->len + + ((first->packets - 1) * tx_params.offload.tso_hdr_len); + } else { + first->packets = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + } + idpf_tx_splitq_map(tx_q, &tx_params, first); return NETDEV_TX_OK; @@ -3320,7 +3124,7 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget) skip_data: rx_buf->page = NULL; - idpf_rx_post_buf_refill(refillq, buf_id); + idpf_post_buf_refill(refillq, buf_id); IDPF_RX_BUMP_NTC(rxq, ntc); /* skip if it is non EOP desc */ @@ -3428,10 +3232,10 @@ static void idpf_rx_clean_refillq(struct idpf_buf_queue *bufq, bool failure; if (idpf_queue_has(RFL_GEN_CHK, refillq) != - !!(refill_desc & IDPF_RX_BI_GEN_M)) + !!(refill_desc & IDPF_RFL_BI_GEN_M)) break; - buf_id = FIELD_GET(IDPF_RX_BI_BUFID_M, refill_desc); + buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); failure = idpf_rx_update_bufq_desc(bufq, buf_id, buf_desc); if (failure) break; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index cd9a20c9cc7fb..f3378e72d6afd 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -107,8 +107,8 @@ do { \ */ #define IDPF_TX_SPLITQ_RE_MIN_GAP 64 -#define IDPF_RX_BI_GEN_M BIT(16) -#define IDPF_RX_BI_BUFID_M GENMASK(15, 0) +#define IDPF_RFL_BI_GEN_M BIT(16) +#define IDPF_RFL_BI_BUFID_M GENMASK(15, 0) #define IDPF_RXD_EOF_SPLITQ VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_EOF_M #define IDPF_RXD_EOF_SINGLEQ VIRTCHNL2_RX_BASE_DESC_STATUS_EOF_M @@ -117,10 +117,6 @@ do { \ ((((txq)->next_to_clean > (txq)->next_to_use) ? 0 : (txq)->desc_count) + \ (txq)->next_to_clean - (txq)->next_to_use - 1) -#define IDPF_TX_BUF_RSV_UNUSED(txq) ((txq)->stash->buf_stack.top) -#define IDPF_TX_BUF_RSV_LOW(txq) (IDPF_TX_BUF_RSV_UNUSED(txq) < \ - (txq)->desc_count >> 2) - #define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 1) /* Determine the absolute number of completions pending, i.e. the number of * completions that are expected to arrive on the TX completion queue. @@ -130,11 +126,7 @@ do { \ 0 : U32_MAX) + \ (txq)->num_completions_pending - (txq)->complq->num_completions) -#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 -/* Adjust the generation for the completion tag and wrap if necessary */ -#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \ - ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ - 0 : (txq)->compl_tag_cur_gen) +#define IDPF_TXBUF_NULL U32_MAX #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) @@ -150,18 +142,6 @@ union idpf_tx_flex_desc { #define idpf_tx_buf libeth_sqe -/** - * struct idpf_buf_lifo - LIFO for managing OOO completions - * @top: Used to know how many buffers are left - * @size: Total size of LIFO - * @bufs: Backing array - */ -struct idpf_buf_lifo { - u16 top; - u16 size; - struct idpf_tx_stash **bufs; -}; - /** * struct idpf_tx_offload_params - Offload parameters for a given packet * @tx_flags: Feature flags enabled for this packet @@ -194,6 +174,9 @@ struct idpf_tx_offload_params { * @compl_tag: Associated tag for completion * @td_tag: Descriptor tunneling tag * @offload: Offload parameters + * @prev_ntu: stored TxQ next_to_use in case of rollback + * @prev_refill_ntc: stored refillq next_to_clean in case of packet rollback + * @prev_refill_gen: stored refillq generation bit in case of packet rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -204,6 +187,10 @@ struct idpf_tx_splitq_params { }; struct idpf_tx_offload_params offload; + + u16 prev_ntu; + u16 prev_refill_ntc; + bool prev_refill_gen; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -463,17 +450,6 @@ struct idpf_tx_queue_stats { #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) #define IDPF_DIM_DEFAULT_PROFILE_IX 1 -/** - * struct idpf_txq_stash - Tx buffer stash for Flow-based scheduling mode - * @buf_stack: Stack of empty buffers to store buffer info for out of order - * buffer completions. See struct idpf_buf_lifo - * @sched_buf_hash: Hash table to store buffers - */ -struct idpf_txq_stash { - struct idpf_buf_lifo buf_stack; - DECLARE_HASHTABLE(sched_buf_hash, 12); -} ____cacheline_aligned; - /** * struct idpf_rx_queue - software structure representing a receive queue * @rx: universal receive descriptor array @@ -604,6 +580,8 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64, * @netdev: &net_device corresponding to this queue * @next_to_use: Next descriptor to use * @next_to_clean: Next descriptor to clean + * @last_re: last descriptor index that RE bit was set + * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @cleaned_bytes: Splitq only, TXQ only: When a TX completion is received on * the TX completion queue, it can be for any TXQ associated * with that completion queue. This means we can clean up to @@ -614,17 +592,14 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64, * only once at the end of the cleaning routine. * @clean_budget: singleq only, queue cleaning budget * @cleaned_pkts: Number of packets cleaned for the above said case - * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather - * @stash: Tx buffer stash for Flow-based scheduling mode - * @compl_tag_bufid_m: Completion tag buffer id mask - * @compl_tag_cur_gen: Used to keep track of current completion tag generation - * @compl_tag_gen_max: To determine when compl_tag_cur_gen should be reset + * @refillq: Pointer to refill queue * @stats_sync: See struct u64_stats_sync * @q_stats: See union idpf_tx_queue_stats * @q_id: Queue id * @size: Length of descriptor ring in bytes * @dma: Physical address of ring * @q_vector: Backreference to associated vector + * @buf_pool_size: Total number of idpf_tx_buf */ struct idpf_tx_queue { __cacheline_group_begin_aligned(read_mostly); @@ -646,7 +621,6 @@ struct idpf_tx_queue { u16 desc_count; u16 tx_min_pkt_len; - u16 compl_tag_gen_s; struct net_device *netdev; __cacheline_group_end_aligned(read_mostly); @@ -654,6 +628,8 @@ struct idpf_tx_queue { __cacheline_group_begin_aligned(read_write); u16 next_to_use; u16 next_to_clean; + u16 last_re; + u16 tx_max_bufs; union { u32 cleaned_bytes; @@ -661,12 +637,7 @@ struct idpf_tx_queue { }; u16 cleaned_pkts; - u16 tx_max_bufs; - struct idpf_txq_stash *stash; - - u16 compl_tag_bufid_m; - u16 compl_tag_cur_gen; - u16 compl_tag_gen_max; + struct idpf_sw_queue *refillq; struct u64_stats_sync stats_sync; struct idpf_tx_queue_stats q_stats; @@ -678,11 +649,12 @@ struct idpf_tx_queue { dma_addr_t dma; struct idpf_q_vector *q_vector; + u32 buf_pool_size; __cacheline_group_end_aligned(cold); }; libeth_cacheline_set_assert(struct idpf_tx_queue, 64, - 88 + sizeof(struct u64_stats_sync), - 24); + 80 + sizeof(struct u64_stats_sync), + 32); /** * struct idpf_buf_queue - software structure representing a buffer queue @@ -892,7 +864,6 @@ struct idpf_rxq_group { * @vport: Vport back pointer * @num_txq: Number of TX queues associated * @txqs: Array of TX queue pointers - * @stashes: array of OOO stashes for the queues * @complq: Associated completion queue pointer, split queue only * @num_completions_pending: Total number of completions pending for the * completion queue, acculumated for all TX queues @@ -907,7 +878,6 @@ struct idpf_txq_group { u16 num_txq; struct idpf_tx_queue *txqs[IDPF_LARGE_MAX_Q]; - struct idpf_txq_stash *stashes; struct idpf_compl_queue *complq; @@ -1000,6 +970,17 @@ static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector) reg->dyn_ctl); } +/** + * idpf_tx_splitq_get_free_bufs - get number of free buf_ids in refillq + * @refillq: pointer to refillq containing buf_ids + */ +static inline u32 idpf_tx_splitq_get_free_bufs(struct idpf_sw_queue *refillq) +{ + return (refillq->next_to_use > refillq->next_to_clean ? + 0 : refillq->desc_count) + + refillq->next_to_use - refillq->next_to_clean - 1; +} + int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget); void idpf_vport_init_num_qs(struct idpf_vport *vport, struct virtchnl2_create_vport *vport_msg); @@ -1027,10 +1008,8 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb); -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 ring_idx); -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb); +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, u32 *buf_count); void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue); netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q); @@ -1039,12 +1018,4 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rxq, u16 cleaned_count); int idpf_tso(struct sk_buff *skb, struct idpf_tx_offload_params *off); -static inline bool idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q, - u32 needed) -{ - return !netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, - IDPF_DESC_UNUSED(tx_q), - needed, needed); -} - #endif /* !_IDPF_TXRX_H_ */ diff --git a/drivers/net/ethernet/microsoft/Kconfig b/drivers/net/ethernet/microsoft/Kconfig index 901fbffbf718e..3f36ee6a8ecee 100644 --- a/drivers/net/ethernet/microsoft/Kconfig +++ b/drivers/net/ethernet/microsoft/Kconfig @@ -22,6 +22,7 @@ config MICROSOFT_MANA depends on PCI_HYPERV select AUXILIARY_BUS select PAGE_POOL + select NET_SHAPER help This driver supports Microsoft Azure Network Adapter (MANA). So far, the driver is only supported on X86_64. diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 7cb9360c05156..a468cd8e5f361 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -102,8 +104,15 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) return err ? err : -EPROTO; } - if (gc->num_msix_usable > resp.max_msix) - gc->num_msix_usable = resp.max_msix; + if (!pci_msix_can_alloc_dyn(pdev)) { + if (gc->num_msix_usable > resp.max_msix) + gc->num_msix_usable = resp.max_msix; + } else { + /* If dynamic allocation is enabled we have already allocated + * hwc msi + */ + gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1); + } if (gc->num_msix_usable <= 1) return -ENOSPC; @@ -637,7 +646,9 @@ static int mana_gd_register_irq(struct gdma_queue *queue, } queue->eq.msix_index = msi_index; - gic = &gc->irq_contexts[msi_index]; + gic = xa_load(&gc->irq_contexts, msi_index); + if (WARN_ON(!gic)) + return -EINVAL; spin_lock_irqsave(&gic->lock, flags); list_add_rcu(&queue->entry, &gic->eq_list); @@ -662,7 +673,10 @@ static void mana_gd_deregiser_irq(struct gdma_queue *queue) if (WARN_ON(msix_index >= gc->num_msix_usable)) return; - gic = &gc->irq_contexts[msix_index]; + gic = xa_load(&gc->irq_contexts, msix_index); + if (WARN_ON(!gic)) + return; + spin_lock_irqsave(&gic->lock, flags); list_for_each_entry_rcu(eq, &gic->eq_list, entry) { if (queue == eq) { @@ -1445,7 +1459,49 @@ void mana_gd_free_res_map(struct gdma_resource *r) r->size = 0; } -static int irq_setup(unsigned int *irqs, unsigned int len, int node) +/* + * Spread on CPUs with the following heuristics: + * + * 1. No more than one IRQ per CPU, if possible; + * 2. NUMA locality is the second priority; + * 3. Sibling dislocality is the last priority. + * + * Let's consider this topology: + * + * Node 0 1 + * Core 0 1 2 3 + * CPU 0 1 2 3 4 5 6 7 + * + * The most performant IRQ distribution based on the above topology + * and heuristics may look like this: + * + * IRQ Nodes Cores CPUs + * 0 1 0 0-1 + * 1 1 1 2-3 + * 2 1 0 0-1 + * 3 1 1 2-3 + * 4 2 2 4-5 + * 5 2 3 6-7 + * 6 2 2 4-5 + * 7 2 3 6-7 + * + * The heuristics is implemented as follows. + * + * The outer for_each() loop resets the 'weight' to the actual number + * of CPUs in the hop. Then inner for_each() loop decrements it by the + * number of sibling groups (cores) while assigning first set of IRQs + * to each group. IRQs 0 and 1 above are distributed this way. + * + * Now, because NUMA locality is more important, we should walk the + * same set of siblings and assign 2nd set of IRQs (2 and 3), and it's + * implemented by the medium while() loop. We do like this unless the + * number of IRQs assigned on this hop will not become equal to number + * of CPUs in the hop (weight == 0). Then we switch to the next hop and + * do the same thing. + */ + +static int irq_setup(unsigned int *irqs, unsigned int len, int node, + bool skip_first_cpu) { const struct cpumask *next, *prev = cpu_none_mask; cpumask_var_t cpus __free(free_cpumask_var); @@ -1460,11 +1516,18 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node) while (weight > 0) { cpumask_andnot(cpus, next, prev); for_each_cpu(cpu, cpus) { + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + --weight; + + if (unlikely(skip_first_cpu)) { + skip_first_cpu = false; + continue; + } + if (len-- == 0) goto done; + irq_set_affinity_and_hint(*irqs++, topology_sibling_cpumask(cpu)); - cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); - --weight; } } prev = next; @@ -1474,47 +1537,108 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node) return 0; } -static int mana_gd_setup_irqs(struct pci_dev *pdev) +static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) { struct gdma_context *gc = pci_get_drvdata(pdev); - unsigned int max_queues_per_port; struct gdma_irq_context *gic; - unsigned int max_irqs, cpu; - int start_irq_index = 1; - int nvec, *irqs, irq; - int err, i = 0, j; + bool skip_first_cpu = false; + int *irqs, irq, err, i; - cpus_read_lock(); - max_queues_per_port = num_online_cpus(); - if (max_queues_per_port > MANA_MAX_NUM_QUEUES) - max_queues_per_port = MANA_MAX_NUM_QUEUES; + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + /* + * While processing the next pci irq vector, we start with index 1, + * as IRQ vector at index 0 is already processed for HWC. + * However, the population of irqs array starts with index 0, to be + * further used in irq_setup() + */ + for (i = 1; i <= nvec; i++) { + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + gic->handler = mana_gd_process_eq_events; + INIT_LIST_HEAD(&gic->eq_list); + spin_lock_init(&gic->lock); - /* Need 1 interrupt for the Hardware communication Channel (HWC) */ - max_irqs = max_queues_per_port + 1; + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", + i - 1, pci_name(pdev)); - nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX); - if (nvec < 0) { - cpus_read_unlock(); - return nvec; + /* one pci vector is already allocated for HWC */ + irqs[i - 1] = pci_irq_vector(pdev, i); + if (irqs[i - 1] < 0) { + err = irqs[i - 1]; + goto free_current_gic; + } + + err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic); + if (err) + goto free_current_gic; + + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - if (nvec <= num_online_cpus()) - start_irq_index = 0; - irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL); - if (!irqs) { - err = -ENOMEM; - goto free_irq_vector; + /* + * When calling irq_setup() for dynamically added IRQs, if number of + * CPUs is more than or equal to allocated MSI-X, we need to skip the + * first CPU sibling group since they are already affinitized to HWC IRQ + */ + cpus_read_lock(); + if (gc->num_msix_usable <= num_online_cpus()) + skip_first_cpu = true; + + err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu); + if (err) { + cpus_read_unlock(); + goto free_irq; } - gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context), - GFP_KERNEL); - if (!gc->irq_contexts) { - err = -ENOMEM; - goto free_irq_array; + cpus_read_unlock(); + kfree(irqs); + return 0; + +free_current_gic: + kfree(gic); +free_irq: + for (i -= 1; i > 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; + + irq_update_affinity_hint(irq, NULL); + free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } + kfree(irqs); + return err; +} + +static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct gdma_irq_context *gic; + int *irqs, *start_irqs, irq; + unsigned int cpu; + int err, i; + + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + start_irqs = irqs; for (i = 0; i < nvec; i++) { - gic = &gc->irq_contexts[i]; + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + gic->handler = mana_gd_process_eq_events; INIT_LIST_HEAD(&gic->eq_list); spin_lock_init(&gic->lock); @@ -1526,69 +1650,128 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", i - 1, pci_name(pdev)); - irq = pci_irq_vector(pdev, i); - if (irq < 0) { - err = irq; - goto free_irq; + irqs[i] = pci_irq_vector(pdev, i); + if (irqs[i] < 0) { + err = irqs[i]; + goto free_current_gic; } - if (!i) { - err = request_irq(irq, mana_gd_intr, 0, gic->name, gic); - if (err) - goto free_irq; - - /* If number of IRQ is one extra than number of online CPUs, - * then we need to assign IRQ0 (hwc irq) and IRQ1 to - * same CPU. - * Else we will use different CPUs for IRQ0 and IRQ1. - * Also we are using cpumask_local_spread instead of - * cpumask_first for the node, because the node can be - * mem only. - */ - if (start_irq_index) { - cpu = cpumask_local_spread(i, gc->numa_node); - irq_set_affinity_and_hint(irq, cpumask_of(cpu)); - } else { - irqs[start_irq_index] = irq; - } - } else { - irqs[i - start_irq_index] = irq; - err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0, - gic->name, gic); - if (err) - goto free_irq; - } + err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic); + if (err) + goto free_current_gic; + + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node); - if (err) + /* If number of IRQ is one extra than number of online CPUs, + * then we need to assign IRQ0 (hwc irq) and IRQ1 to + * same CPU. + * Else we will use different CPUs for IRQ0 and IRQ1. + * Also we are using cpumask_local_spread instead of + * cpumask_first for the node, because the node can be + * mem only. + */ + cpus_read_lock(); + if (nvec > num_online_cpus()) { + cpu = cpumask_local_spread(0, gc->numa_node); + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu)); + irqs++; + nvec -= 1; + } + + err = irq_setup(irqs, nvec, gc->numa_node, false); + if (err) { + cpus_read_unlock(); goto free_irq; + } - gc->max_num_msix = nvec; - gc->num_msix_usable = nvec; cpus_read_unlock(); - kfree(irqs); + kfree(start_irqs); return 0; +free_current_gic: + kfree(gic); free_irq: - for (j = i - 1; j >= 0; j--) { - irq = pci_irq_vector(pdev, j); - gic = &gc->irq_contexts[j]; + for (i -= 1; i >= 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; -free_irq_array: - kfree(irqs); -free_irq_vector: - cpus_read_unlock(); - pci_free_irq_vectors(pdev); + kfree(start_irqs); return err; } +static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + unsigned int max_irqs, min_irqs; + int nvec, err; + + if (pci_msix_can_alloc_dyn(pdev)) { + max_irqs = 1; + min_irqs = 1; + } else { + /* Need 1 interrupt for HWC */ + max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1; + min_irqs = 2; + } + + nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX); + if (nvec < 0) + return nvec; + + err = mana_gd_setup_irqs(pdev, nvec); + if (err) { + pci_free_irq_vectors(pdev); + return err; + } + + gc->num_msix_usable = nvec; + gc->max_num_msix = nvec; + + return 0; +} + +static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct msi_map irq_map; + int max_irqs, i, err; + + if (!pci_msix_can_alloc_dyn(pdev)) + /* remain irqs are already allocated with HWC IRQ */ + return 0; + + /* allocate only remaining IRQs*/ + max_irqs = gc->num_msix_usable - 1; + + for (i = 1; i <= max_irqs; i++) { + irq_map = pci_msix_alloc_irq_at(pdev, i, NULL); + if (!irq_map.virq) { + err = irq_map.index; + /* caller will handle cleaning up all allocated + * irqs, after HWC is destroyed + */ + return err; + } + } + + err = mana_gd_setup_dyn_irqs(pdev, max_irqs); + if (err) + return err; + + gc->max_num_msix = gc->max_num_msix + max_irqs; + + return 0; +} + static void mana_gd_remove_irqs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -1603,19 +1786,21 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev) if (irq < 0) continue; - gic = &gc->irq_contexts[i]; + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; /* Need to clear the hint before free_irq */ irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } pci_free_irq_vectors(pdev); gc->max_num_msix = 0; gc->num_msix_usable = 0; - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; } static int mana_gd_setup(struct pci_dev *pdev) @@ -1630,9 +1815,10 @@ static int mana_gd_setup(struct pci_dev *pdev) if (!gc->service_wq) return -ENOMEM; - err = mana_gd_setup_irqs(pdev); + err = mana_gd_setup_hwc_irqs(pdev); if (err) { - dev_err(gc->dev, "Failed to setup IRQs: %d\n", err); + dev_err(gc->dev, "Failed to setup IRQs for HWC creation: %d\n", + err); goto free_workqueue; } @@ -1648,6 +1834,12 @@ static int mana_gd_setup(struct pci_dev *pdev) if (err) goto destroy_hwc; + err = mana_gd_setup_remaining_irqs(pdev); + if (err) { + dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); + goto destroy_hwc; + } + err = mana_gd_detect_devices(pdev); if (err) goto destroy_hwc; @@ -1728,6 +1920,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gc->is_pf = mana_is_pf(pdev->device); gc->bar0_va = bar0_va; gc->dev = &pdev->dev; + xa_init(&gc->irq_contexts); if (gc->is_pf) gc->mana_pci_debugfs = debugfs_create_dir("0", mana_debugfs_root); @@ -1762,6 +1955,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ debugfs_remove_recursive(gc->mana_pci_debugfs); gc->mana_pci_debugfs = NULL; + xa_destroy(&gc->irq_contexts); pci_iounmap(pdev, bar0_va); free_gc: pci_set_drvdata(pdev, NULL); @@ -1787,6 +1981,8 @@ static void mana_gd_remove(struct pci_dev *pdev) gc->mana_pci_debugfs = NULL; + xa_destroy(&gc->irq_contexts); + pci_iounmap(pdev, gc->bar0_va); vfree(gc); diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index ef072e24c46d0..ada6c78a2bef4 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -881,7 +881,12 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, if (!wait_for_completion_timeout(&ctx->comp_event, (msecs_to_jiffies(hwc->hwc_timeout)))) { if (hwc->hwc_timeout != 0) - dev_err(hwc->dev, "HWC: Request timed out!\n"); + dev_err(hwc->dev, "HWC: Request timed out: %u ms\n", + hwc->hwc_timeout); + + /* Reduce further waiting if HWC no response */ + if (hwc->hwc_timeout > 1) + hwc->hwc_timeout = 1; err = -ETIMEDOUT; goto out; diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c index d30721d4516fc..7697c9b52ed34 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c @@ -174,6 +174,7 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, struct mana_port_context *apc = netdev_priv(ndev); struct bpf_prog *old_prog; struct gdma_context *gc; + int err; gc = apc->ac->gdma_dev->gdma_context; @@ -195,11 +196,45 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, */ apc->bpf_prog = prog; - if (old_prog) - bpf_prog_put(old_prog); + if (apc->port_is_up) { + /* Re-create rxq's after xdp prog was loaded or unloaded. + * Ex: re create rxq's to switch from full pages to smaller + * size page fragments when xdp prog is unloaded and + * vice-versa. + */ + + /* Pre-allocate buffers to prevent failure in mana_attach */ + err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "XDP: Insufficient memory for tx/rx re-config"); + return err; + } + + err = mana_detach(ndev, false); + if (err) { + netdev_err(ndev, + "mana_detach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at detach"); + goto err_dealloc_rxbuffs; + } + + err = mana_attach(ndev); + if (err) { + netdev_err(ndev, + "mana_attach failed at xdp set: %d\n", err); + NL_SET_ERR_MSG_MOD(extack, + "XDP: Re-config failed at attach"); + goto err_dealloc_rxbuffs; + } - if (apc->port_is_up) mana_chn_setxdp(apc, prog); + mana_pre_dealloc_rxbufs(apc); + } + + if (old_prog) + bpf_prog_put(old_prog); if (prog) ndev->max_mtu = MANA_XDP_MTU_MAX; @@ -207,6 +242,11 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog, ndev->max_mtu = gc->adapter_mtu - ETH_HLEN; return 0; + +err_dealloc_rxbuffs: + apc->bpf_prog = old_prog; + mana_pre_dealloc_rxbufs(apc); + return err; } int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 721d049c54de2..fdb32551833cd 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -55,6 +55,15 @@ static bool mana_en_need_log(struct mana_port_context *apc, int err) return true; } +static void mana_put_rx_page(struct mana_rxq *rxq, struct page *page, + bool from_pool) +{ + if (from_pool) + page_pool_put_full_page(rxq->page_pool, page, false); + else + put_page(page); +} + /* Microsoft Azure Network Adapter (MANA) functions */ static int mana_open(struct net_device *ndev) @@ -628,21 +637,40 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da) } /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */ -static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, - u32 *headroom) +static void mana_get_rxbuf_cfg(struct mana_port_context *apc, + int mtu, u32 *datasize, u32 *alloc_size, + u32 *headroom, u32 *frag_count) { - if (mtu > MANA_XDP_MTU_MAX) - *headroom = 0; /* no support for XDP */ - else - *headroom = XDP_PACKET_HEADROOM; + u32 len, buf_size; - *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + /* Calculate datasize first (consistent across all cases) */ + *datasize = mtu + ETH_HLEN; - /* Using page pool in this case, so alloc_size is PAGE_SIZE */ - if (*alloc_size < PAGE_SIZE) - *alloc_size = PAGE_SIZE; + /* For xdp and jumbo frames make sure only one packet fits per page */ + if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) { + if (mana_xdp_get(apc)) { + *headroom = XDP_PACKET_HEADROOM; + *alloc_size = PAGE_SIZE; + } else { + *headroom = 0; /* no support for XDP */ + *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + + *headroom); + } - *datasize = mtu + ETH_HLEN; + *frag_count = 1; + return; + } + + /* Standard MTU case - optimize for multiple packets per page */ + *headroom = 0; + + /* Calculate base buffer size needed */ + len = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); + buf_size = ALIGN(len, MANA_RX_FRAG_ALIGNMENT); + + /* Calculate how many packets can fit in a page */ + *frag_count = PAGE_SIZE / buf_size; + *alloc_size = buf_size; } int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_queues) @@ -654,8 +682,9 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu, int num_qu void *va; int i; - mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize, - &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom); + mana_get_rxbuf_cfg(mpc, new_mtu, &mpc->rxbpre_datasize, + &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom, + &mpc->rxbpre_frag_count); dev = mpc->ac->gdma_dev->gdma_context->dev; @@ -731,6 +760,78 @@ static int mana_change_mtu(struct net_device *ndev, int new_mtu) return err; } +static int mana_shaper_set(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(binding->netdev); + u32 old_speed, rate; + int err; + + if (shaper->handle.scope != NET_SHAPER_SCOPE_NETDEV) { + NL_SET_ERR_MSG_MOD(extack, "net shaper scope should be netdev"); + return -EINVAL; + } + + if (apc->handle.id && shaper->handle.id != apc->handle.id) { + NL_SET_ERR_MSG_MOD(extack, "Cannot create multiple shapers"); + return -EOPNOTSUPP; + } + + if (!shaper->bw_max || (shaper->bw_max % 100000000)) { + NL_SET_ERR_MSG_MOD(extack, "Please use multiples of 100Mbps for bandwidth"); + return -EINVAL; + } + + rate = div_u64(shaper->bw_max, 1000); /* Convert bps to Kbps */ + rate = div_u64(rate, 1000); /* Convert Kbps to Mbps */ + + /* Get current speed */ + err = mana_query_link_cfg(apc); + old_speed = (err) ? SPEED_UNKNOWN : apc->speed; + + if (!err) { + err = mana_set_bw_clamp(apc, rate, TRI_STATE_TRUE); + apc->speed = (err) ? old_speed : rate; + apc->handle = (err) ? apc->handle : shaper->handle; + } + + return err; +} + +static int mana_shaper_del(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(binding->netdev); + int err; + + err = mana_set_bw_clamp(apc, 0, TRI_STATE_FALSE); + + if (!err) { + /* Reset mana port context parameters */ + apc->handle.id = 0; + apc->handle.scope = NET_SHAPER_SCOPE_UNSPEC; + apc->speed = 0; + } + + return err; +} + +static void mana_shaper_cap(struct net_shaper_binding *binding, + enum net_shaper_scope scope, + unsigned long *flags) +{ + *flags = BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MAX) | + BIT(NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS); +} + +static const struct net_shaper_ops mana_shaper_ops = { + .set = mana_shaper_set, + .delete = mana_shaper_del, + .capabilities = mana_shaper_cap, +}; + static const struct net_device_ops mana_devops = { .ndo_open = mana_open, .ndo_stop = mana_close, @@ -741,6 +842,7 @@ static const struct net_device_ops mana_devops = { .ndo_bpf = mana_bpf, .ndo_xdp_xmit = mana_xdp_xmit, .ndo_change_mtu = mana_change_mtu, + .net_shaper_ops = &mana_shaper_ops, }; static void mana_cleanup_port_context(struct mana_port_context *apc) @@ -1184,6 +1286,95 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, return err; } +int mana_query_link_cfg(struct mana_port_context *apc) +{ + struct net_device *ndev = apc->ndev; + struct mana_query_link_config_resp resp = {}; + struct mana_query_link_config_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_LINK_CONFIG, + sizeof(req), sizeof(resp)); + + req.vport = apc->port_handle; + req.hdr.resp.msg_version = GDMA_MESSAGE_V2; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + + if (err) { + if (err == -EOPNOTSUPP) { + netdev_info_once(ndev, "MANA_QUERY_LINK_CONFIG not supported\n"); + return err; + } + netdev_err(ndev, "Failed to query link config: %d\n", err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_LINK_CONFIG, + sizeof(resp)); + + if (err || resp.hdr.status) { + netdev_err(ndev, "Failed to query link config: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EOPNOTSUPP; + return err; + } + + if (resp.qos_unconfigured) { + err = -EINVAL; + return err; + } + apc->speed = resp.link_speed_mbps; + apc->max_speed = resp.qos_speed_mbps; + return 0; +} + +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping) +{ + struct mana_set_bw_clamp_resp resp = {}; + struct mana_set_bw_clamp_req req = {}; + struct net_device *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_SET_BW_CLAMP, + sizeof(req), sizeof(resp)); + req.vport = apc->port_handle; + req.link_speed_mbps = speed; + req.enable_clamping = enable_clamping; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + + if (err) { + if (err == -EOPNOTSUPP) { + netdev_info_once(ndev, "MANA_SET_BW_CLAMP not supported\n"); + return err; + } + netdev_err(ndev, "Failed to set bandwidth clamp for speed %u, err = %d", + speed, err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_SET_BW_CLAMP, + sizeof(resp)); + + if (err || resp.hdr.status) { + netdev_err(ndev, "Failed to set bandwidth clamp: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EOPNOTSUPP; + return err; + } + + if (resp.qos_unconfigured) + netdev_info(ndev, "QoS is unconfigured\n"); + + return 0; +} + int mana_create_wq_obj(struct mana_port_context *apc, mana_handle_t vport, u32 wq_type, struct mana_obj_spec *wq_spec, @@ -1678,8 +1869,11 @@ static void mana_rx_skb(void *buf_va, bool from_pool, drop: if (from_pool) { - page_pool_recycle_direct(rxq->page_pool, - virt_to_head_page(buf_va)); + if (rxq->frag_count == 1) + page_pool_recycle_direct(rxq->page_pool, + virt_to_head_page(buf_va)); + else + page_pool_free_va(rxq->page_pool, buf_va, true); } else { WARN_ON_ONCE(rxq->xdp_save_va); /* Save for reuse */ @@ -1695,33 +1889,46 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev, dma_addr_t *da, bool *from_pool) { struct page *page; + u32 offset; void *va; - *from_pool = false; - /* Reuse XDP dropped page if available */ - if (rxq->xdp_save_va) { - va = rxq->xdp_save_va; - rxq->xdp_save_va = NULL; - } else { - page = page_pool_dev_alloc_pages(rxq->page_pool); - if (!page) + /* Don't use fragments for jumbo frames or XDP where it's 1 fragment + * per page. + */ + if (rxq->frag_count == 1) { + /* Reuse XDP dropped page if available */ + if (rxq->xdp_save_va) { + va = rxq->xdp_save_va; + page = virt_to_head_page(va); + rxq->xdp_save_va = NULL; + } else { + page = page_pool_dev_alloc_pages(rxq->page_pool); + if (!page) + return NULL; + + *from_pool = true; + va = page_to_virt(page); + } + + *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, + DMA_FROM_DEVICE); + if (dma_mapping_error(dev, *da)) { + mana_put_rx_page(rxq, page, *from_pool); return NULL; + } - *from_pool = true; - va = page_to_virt(page); + return va; } - *da = dma_map_single(dev, va + rxq->headroom, rxq->datasize, - DMA_FROM_DEVICE); - if (dma_mapping_error(dev, *da)) { - if (*from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(virt_to_head_page(va)); - + page = page_pool_dev_alloc_frag(rxq->page_pool, &offset, + rxq->alloc_size); + if (!page) return NULL; - } + + va = page_to_virt(page) + offset; + *da = page_pool_get_dma_addr(page) + offset + rxq->headroom; + *from_pool = true; return va; } @@ -1738,9 +1945,9 @@ static void mana_refill_rx_oob(struct device *dev, struct mana_rxq *rxq, va = mana_get_rxfrag(rxq, dev, &da, &from_pool); if (!va) return; - - dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, - DMA_FROM_DEVICE); + if (!rxoob->from_pool || rxq->frag_count == 1) + dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize, + DMA_FROM_DEVICE); *old_buf = rxoob->buf_va; *old_fp = rxoob->from_pool; @@ -2145,15 +2352,15 @@ static void mana_destroy_rxq(struct mana_port_context *apc, if (!rx_oob->buf_va) continue; - dma_unmap_single(dev, rx_oob->sgl[0].address, - rx_oob->sgl[0].size, DMA_FROM_DEVICE); - page = virt_to_head_page(rx_oob->buf_va); - if (rx_oob->from_pool) - page_pool_put_full_page(rxq->page_pool, page, false); - else - put_page(page); + if (rxq->frag_count == 1 || !rx_oob->from_pool) { + dma_unmap_single(dev, rx_oob->sgl[0].address, + rx_oob->sgl[0].size, DMA_FROM_DEVICE); + mana_put_rx_page(rxq, page, rx_oob->from_pool); + } else { + page_pool_free_va(rxq->page_pool, rx_oob->buf_va, true); + } rx_oob->buf_va = NULL; } @@ -2259,11 +2466,21 @@ static int mana_create_page_pool(struct mana_rxq *rxq, struct gdma_context *gc) struct page_pool_params pprm = {}; int ret; - pprm.pool_size = mpc->rx_queue_size; + pprm.pool_size = mpc->rx_queue_size / rxq->frag_count + 1; pprm.nid = gc->numa_node; pprm.napi = &rxq->rx_cq.napi; pprm.netdev = rxq->ndev; pprm.order = get_order(rxq->alloc_size); + pprm.dev = gc->dev; + + /* Let the page pool do the dma map when page sharing with multiple + * fragments enabled for rx buffers. + */ + if (rxq->frag_count > 1) { + pprm.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pprm.max_len = PAGE_SIZE; + pprm.dma_dir = DMA_FROM_DEVICE; + } rxq->page_pool = page_pool_create(&pprm); @@ -2302,9 +2519,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, rxq->rxq_idx = rxq_idx; rxq->rxobj = INVALID_MANA_HANDLE; - mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size, - &rxq->headroom); - + mana_get_rxbuf_cfg(apc, ndev->mtu, &rxq->datasize, &rxq->alloc_size, + &rxq->headroom, &rxq->frag_count); /* Create page pool for RX queue */ err = mana_create_page_pool(rxq, gc); if (err) { @@ -3024,6 +3240,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, goto free_indir; } + debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs, &apc->speed); + return 0; free_indir: diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index 4fb3a04994a2d..a1afa75a94631 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -495,6 +495,12 @@ static int mana_set_ringparam(struct net_device *ndev, static int mana_get_link_ksettings(struct net_device *ndev, struct ethtool_link_ksettings *cmd) { + struct mana_port_context *apc = netdev_priv(ndev); + int err; + + err = mana_query_link_cfg(apc); + cmd->base.speed = (err) ? SPEED_UNKNOWN : apc->max_speed; + cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_OTHER; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 720104661d7f2..60a4629fe6ba7 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1812,6 +1812,11 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Enable NAPI handler before init callbacks */ netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll); + napi_enable(&net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, + &net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, + &net_device->chan_table[0].napi); /* Open the channel */ device->channel->next_request_id_callback = vmbus_next_request_id; @@ -1831,12 +1836,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Channel is opened */ netdev_dbg(ndev, "hv_netvsc channel opened successfully\n"); - napi_enable(&net_device->chan_table[0].napi); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, - &net_device->chan_table[0].napi); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, - &net_device->chan_table[0].napi); - /* Connect with the NetVsp */ ret = netvsc_connect_vsp(device, net_device, device_info); if (ret != 0) { @@ -1854,14 +1853,14 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, close: RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL); - napi_disable(&net_device->chan_table[0].napi); /* Now, we can close the channel safely */ vmbus_close(device->channel); cleanup: + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL); + napi_disable(&net_device->chan_table[0].napi); netif_napi_del(&net_device->chan_table[0].napi); cleanup2: diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 9e73959e61ee0..c35f9685b6bf0 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1252,17 +1252,26 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE; + /* Enable napi before opening the vmbus channel to avoid races + * as the host placing data on the host->guest ring may be left + * out if napi was not enabled. + */ + napi_enable(&nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + &nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + &nvchan->napi); + ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); - if (ret == 0) { - napi_enable(&nvchan->napi); - netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, - &nvchan->napi); - netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, - &nvchan->napi); - } else { + if (ret != 0) { netdev_notice(ndev, "sub channel open failed: %d\n", ret); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + NULL); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + NULL); + napi_disable(&nvchan->napi); } if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 37fac5a42587b..3312a9cb13889 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2061,6 +2061,7 @@ static struct irq_chip hv_msi_irq_chip = { static struct msi_domain_ops hv_msi_ops = { .msi_prepare = hv_msi_prepare, .msi_free = hv_msi_free, + .prepare_desc = pci_msix_prepare_desc, }; /** @@ -2082,7 +2083,7 @@ static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) hbus->msi_info.ops = &hv_msi_ops; hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | - MSI_FLAG_PCI_MSIX); + MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN); hbus->msi_info.handler = FLOW_HANDLER; hbus->msi_info.handler_name = FLOW_NAME; hbus->msi_info.data = hbus; diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index d7ba8795d60f8..43129aa6d6c75 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -222,13 +222,14 @@ static void pci_irq_unmask_msix(struct irq_data *data) pci_msix_unmask(irq_data_get_msi_desc(data)); } -static void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg, - struct msi_desc *desc) +void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg, + struct msi_desc *desc) { /* Don't fiddle with preallocated MSI descriptors */ if (!desc->pci.mask_base) msix_prepare_msi_desc(to_pci_dev(desc->dev), desc); } +EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index f772753a391b4..bb4f9e75f1863 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1407,14 +1407,19 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, } /* - * Our channel array is sparsley populated and we + * Our channel array could be sparsley populated and we * initiated I/O on a processor/hw-q that does not * currently have a designated channel. Fix this. * The strategy is simple: - * I. Ensure NUMA locality - * II. Distribute evenly (best effort) + * I. Prefer the channel associated with the current CPU + * II. Ensure NUMA locality + * III. Distribute evenly (best effort) */ + /* Prefer the channel on the I/O issuing processor/hw-q */ + if (cpumask_test_cpu(q_num, &stor_device->alloced_cpus)) + return stor_device->stor_chns[q_num]; + node_mask = cpumask_of_node(cpu_to_node(q_num)); num_channels = 0; @@ -1470,59 +1475,48 @@ static int storvsc_do_io(struct hv_device *device, /* See storvsc_change_target_cpu(). */ outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); if (outgoing_channel != NULL) { - if (outgoing_channel->target_cpu == q_num) { - /* - * Ideally, we want to pick a different channel if - * available on the same NUMA node. - */ - node_mask = cpumask_of_node(cpu_to_node(q_num)); - for_each_cpu_wrap(tgt_cpu, - &stor_device->alloced_cpus, q_num + 1) { - if (!cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - if (tgt_cpu == q_num) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } - } + if (hv_get_avail_to_write_percent(&outgoing_channel->outbound) + > ring_avail_percent_lowater) + goto found_channel; - /* - * All the other channels on the same NUMA node are - * busy. Try to use the channel on the current CPU - */ - if (hv_get_avail_to_write_percent( - &outgoing_channel->outbound) - > ring_avail_percent_lowater) + /* + * Channel is busy, try to find a channel on the same NUMA node + */ + node_mask = cpumask_of_node(cpu_to_node(q_num)); + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + if (!cpumask_test_cpu(tgt_cpu, node_mask)) + continue; + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; goto found_channel; + } + } - /* - * If we reach here, all the channels on the current - * NUMA node are busy. Try to find a channel in - * other NUMA nodes - */ - for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { - if (cpumask_test_cpu(tgt_cpu, node_mask)) - continue; - channel = READ_ONCE( - stor_device->stor_chns[tgt_cpu]); - if (channel == NULL) - continue; - if (hv_get_avail_to_write_percent( - &channel->outbound) - > ring_avail_percent_lowater) { - outgoing_channel = channel; - goto found_channel; - } + /* + * If we reach here, all the channels on the current + * NUMA node are busy. Try to find a channel in + * all NUMA nodes + */ + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; + goto found_channel; } } + /* + * If we reach here, all the channels are busy. Use the + * original channel found. + */ } else { spin_lock_irqsave(&stor_device->lock, flags); outgoing_channel = stor_device->stor_chns[q_num]; diff --git a/fs/dcache.c b/fs/dcache.c index cd7c019f92f9b..17541c28ae9d0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1032,6 +1032,15 @@ struct dentry *d_find_alias_rcu(struct inode *inode) return de; } +void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) +{ + spin_lock(&dentry->d_lock); + if (!dentry->d_lockref.count) + to_shrink_list(dentry, dispose); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_dispose_if_unused); + /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. @@ -1042,12 +1051,8 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { - spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) - to_shrink_list(dentry, &dispose); - spin_unlock(&dentry->d_lock); - } + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } @@ -1087,6 +1092,7 @@ void shrink_dentry_list(struct list_head *list) shrink_kill(dentry); } } +EXPORT_SYMBOL(shrink_dentry_list); static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1f64ae6d7a69e..a9de694d48fcd 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -21,6 +21,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -1887,6 +1888,19 @@ static int fuse_notify_resend(struct fuse_conn *fc) return 0; } +/* + * Increments the fuse connection epoch. This will result of dentries from + * previous epochs to be invalidated. + * + * XXX optimization: add call to shrink_dcache_sb()? + */ +static int fuse_notify_inc_epoch(struct fuse_conn *fc) +{ + atomic_inc(&fc->epoch); + + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1915,6 +1929,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RESEND: return fuse_notify_resend(fc); + case FUSE_NOTIFY_INC_EPOCH: + return fuse_notify_inc_epoch(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -1989,7 +2006,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); - goto out; + goto copy_finish; } err = -EINVAL; @@ -2453,6 +2470,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } } +#ifdef CONFIG_PROC_FS +static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) +{ + struct fuse_dev *fud = fuse_get_dev(file); + if (!fud) + return; + + seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev); +} +#endif + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, @@ -2465,6 +2493,9 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#ifdef CONFIG_PROC_FS + .show_fdinfo = fuse_dev_show_fdinfo, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 54104dd48af7c..f2fadaf084843 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -197,9 +197,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) struct inode *inode; struct dentry *parent; struct fuse_mount *fm; + struct fuse_conn *fc; struct fuse_inode *fi; int ret; + fc = get_fuse_conn_super(entry->d_sb); + if (entry->d_time < atomic_read(&fc->epoch)) + goto invalid; + inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; @@ -415,16 +420,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { - int err; struct fuse_entry_out outarg; + struct fuse_conn *fc; struct inode *inode; struct dentry *newent; + int err, epoch; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + fc = get_fuse_conn_super(dir->i_sb); + epoch = atomic_read(&fc->epoch); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -446,6 +455,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, goto out_err; entry = newent ? newent : entry; + entry->d_time = epoch; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -618,7 +628,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { - int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); @@ -628,11 +637,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + int epoch, err; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); + epoch = atomic_read(&fm->fc->epoch); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -701,6 +712,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, } kfree(forget); d_instantiate(entry, inode); + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); @@ -787,12 +799,14 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; - int err; struct fuse_forget_link *forget; + int epoch, err; if (fuse_is_bad(dir)) return -EIO; + epoch = atomic_read(&fm->fc->epoch); + forget = fuse_alloc_forget(); if (!forget) return -ENOMEM; @@ -835,9 +849,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, return PTR_ERR(d); if (d) { + d->d_time = epoch; fuse_change_entry_timeout(d, &outarg); dput(d); } else { + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outarg); } fuse_dir_changed(dir); @@ -1632,7 +1648,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) @@ -1680,6 +1696,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e6cc3d552b138..6873c5f401a27 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -604,6 +604,9 @@ struct fuse_conn { /** Number of fuse_dev's */ atomic_t dev_count; + /** Current epoch for up-to-date dentries */ + atomic_t epoch; + struct rcu_head rcu; /** The user id for this mount */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd3321e29a3e5..8802111e0b478 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -926,6 +926,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); + atomic_set(&fc->epoch, 1); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 0377b6dc24c80..1987cdbeeb075 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file, struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int epoch; if (!o->nodeid) { /* @@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file, return -EIO; fc = get_fuse_conn(dir); + epoch = atomic_read(&fc->epoch); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); @@ -256,6 +258,7 @@ static int fuse_direntplus_link(struct file *file, } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + dentry->d_time = epoch; fuse_change_entry_timeout(dentry, o); dput(dentry); diff --git a/fs/namei.c b/fs/namei.c index 999e3dfe32582..65cfdefa535f7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5334,10 +5334,9 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ -const char *page_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback) +static char *__page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; @@ -5356,8 +5355,23 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode, } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); - kaddr = page_address(page); - nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); + return page_address(page); +} + +const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + return __page_get_link(dentry, inode, callback); +} +EXPORT_SYMBOL_GPL(page_get_link_raw); + +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + char *kaddr = __page_get_link(dentry, inode, callback); + + if (!IS_ERR(kaddr)) + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 69782344e7670..d0d64d13cdbc3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -256,6 +256,8 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); +extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); diff --git a/include/linux/fs.h b/include/linux/fs.h index dddb4c98e2368..b39fc1f0a70eb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3334,6 +3334,8 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); +extern const char *page_get_link_raw(struct dentry *, struct inode *, + struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 8ab57274a21ec..9a32980716200 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -37,6 +37,7 @@ enum io_uring_cmd_flags { /* set when uring wants to cancel a previously issued command */ IO_URING_F_CANCEL = (1 << 11), IO_URING_F_COMPAT = (1 << 12), + IO_URING_F_TASK_DEAD = (1 << 13), }; struct io_wq_work_node { diff --git a/include/linux/msi.h b/include/linux/msi.h index 8d97b890faeca..b3d17b7ebdbec 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -679,6 +679,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); +void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg, + struct msi_desc *desc); #else /* CONFIG_PCI_MSI */ static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) { diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 79516db61bcae..6c300fd325237 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -390,7 +390,7 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_irq_context *irq_contexts; + struct xarray irq_contexts; /* L2 MTU */ u16 adapter_mtu; @@ -582,18 +582,23 @@ enum { /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver supports dynamic MSI-X vector allocation */ +#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) + /* Driver can self reset on EQE notification */ #define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14) /* Driver can self reset on FPGA Reconfig EQE notification */ #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 4176edf1be719..0921485565c05 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -5,6 +5,7 @@ #define _MANA_H #include +#include #include "gdma.h" #include "hw_channel.h" @@ -64,6 +65,8 @@ enum TRI_STATE { #define MANA_STATS_RX_COUNT 5 #define MANA_STATS_TX_COUNT 11 +#define MANA_RX_FRAG_ALIGNMENT 64 + struct mana_stats_rx { u64 packets; u64 bytes; @@ -327,6 +330,7 @@ struct mana_rxq { u32 datasize; u32 alloc_size; u32 headroom; + u32 frag_count; mana_handle_t rxobj; @@ -509,6 +513,7 @@ struct mana_port_context { u32 rxbpre_datasize; u32 rxbpre_alloc_size; u32 rxbpre_headroom; + u32 rxbpre_frag_count; struct bpf_prog *bpf_prog; @@ -526,7 +531,14 @@ struct mana_port_context { struct mutex vport_mutex; int vport_use_count; + /* Net shaper handle*/ + struct net_shaper_handle handle; + u16 port_idx; + /* Currently configured speed (mbps) */ + u32 speed; + /* Maximum speed supported by the SKU (mbps) */ + u32 max_speed; bool port_is_up; bool port_st_save; /* Saved port state */ @@ -562,6 +574,9 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); void mana_query_gf_stats(struct mana_port_context *apc); +int mana_query_link_cfg(struct mana_port_context *apc); +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping); void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); @@ -589,6 +604,8 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_LINK_CONFIG = 0x2000A, + MANA_SET_BW_CLAMP = 0x2000B, MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ @@ -598,6 +615,35 @@ enum mana_command_code { MANA_DEREGISTER_HW_PORT = 0x28004, }; +/* Query Link Configuration*/ +struct mana_query_link_config_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; +}; /* HW DATA */ + +struct mana_query_link_config_resp { + struct gdma_resp_hdr hdr; + u32 qos_speed_mbps; + u8 qos_unconfigured; + u8 reserved1[3]; + u32 link_speed_mbps; + u8 reserved2[4]; +}; /* HW DATA */ + +/* Set Bandwidth Clamp*/ +struct mana_set_bw_clamp_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + enum TRI_STATE enable_clamping; + u32 link_speed_mbps; +}; /* HW DATA */ + +struct mana_set_bw_clamp_resp { + struct gdma_resp_hdr hdr; + u8 qos_unconfigured; + u8 reserved[7]; +}; /* HW DATA */ + /* Query Device Configuration */ struct mana_query_device_cfg_req { struct gdma_req_hdr hdr; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index f1e99458e29e4..a022b3707ad6d 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -220,6 +220,9 @@ * * 7.41 * - add FUSE_ALLOW_IDMAP + * + * 7.44 + * - add FUSE_NOTIFY_INC_EPOCH */ #ifndef _LINUX_FUSE_H @@ -255,7 +258,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 41 +#define FUSE_KERNEL_MINOR_VERSION 44 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -655,6 +658,7 @@ enum fuse_notify_code { FUSE_NOTIFY_RETRIEVE = 5, FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, + FUSE_NOTIFY_INC_EPOCH = 8, FUSE_NOTIFY_CODE_MAX, }; diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 6259292259f32..cc2b9ea2641da 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + unsigned int flags = IO_URING_F_COMPLETE_DEFER; + + if (current->flags & (PF_EXITING | PF_KTHREAD)) + flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ - ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); + ioucmd->task_work_cb(ioucmd, flags); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index ae57bf69ad4af..902513e36a36d 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -83,6 +83,7 @@ enum { }; static int in_hand_shake; +static int debug; static char *os_name = ""; static char *os_major = ""; @@ -183,6 +184,20 @@ static void kvp_update_file(int pool) kvp_release_lock(pool); } +static void kvp_dump_initial_pools(int pool) +{ + int i; + + syslog(LOG_DEBUG, "===Start dumping the contents of pool %d ===\n", + pool); + + for (i = 0; i < kvp_file_info[pool].num_records; i++) + syslog(LOG_DEBUG, "pool: %d, %d/%d key=%s val=%s\n", + pool, i + 1, kvp_file_info[pool].num_records, + kvp_file_info[pool].records[i].key, + kvp_file_info[pool].records[i].value); +} + static void kvp_update_mem_state(int pool) { FILE *filep; @@ -270,6 +285,8 @@ static int kvp_file_init(void) return 1; kvp_file_info[i].num_records = 0; kvp_update_mem_state(i); + if (debug) + kvp_dump_initial_pools(i); } return 0; @@ -297,6 +314,9 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) * Found a match; just move the remaining * entries up. */ + if (debug) + syslog(LOG_DEBUG, "%s: deleting the KVP: pool=%d key=%s val=%s", + __func__, pool, record[i].key, record[i].value); if (i == (num_records - 1)) { kvp_file_info[pool].num_records--; kvp_update_file(pool); @@ -315,20 +335,36 @@ static int kvp_key_delete(int pool, const __u8 *key, int key_size) kvp_update_file(pool); return 0; } + + if (debug) + syslog(LOG_DEBUG, "%s: could not delete KVP: pool=%d key=%s. Record not found", + __func__, pool, key); + return 1; } static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, const __u8 *value, int value_size) { - int i; - int num_records; struct kvp_record *record; + int num_records; int num_blocks; + int i; + + if (debug) + syslog(LOG_DEBUG, "%s: got a KVP: pool=%d key=%s val=%s", + __func__, pool, key, value); if ((key_size > HV_KVP_EXCHANGE_MAX_KEY_SIZE) || - (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) + (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE)) { + syslog(LOG_ERR, "%s: Too long key or value: key=%s, val=%s", + __func__, key, value); + + if (debug) + syslog(LOG_DEBUG, "%s: Too long key or value: pool=%d, key=%s, val=%s", + __func__, pool, key, value); return 1; + } /* * First update the in-memory state. @@ -348,6 +384,9 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, */ memcpy(record[i].value, value, value_size); kvp_update_file(pool); + if (debug) + syslog(LOG_DEBUG, "%s: updated: pool=%d key=%s val=%s", + __func__, pool, key, value); return 0; } @@ -359,8 +398,10 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, record = realloc(record, sizeof(struct kvp_record) * ENTRIES_PER_BLOCK * (num_blocks + 1)); - if (record == NULL) + if (!record) { + syslog(LOG_ERR, "%s: Memory alloc failure", __func__); return 1; + } kvp_file_info[pool].num_blocks++; } @@ -368,6 +409,11 @@ static int kvp_key_add_or_modify(int pool, const __u8 *key, int key_size, memcpy(record[i].key, key, key_size); kvp_file_info[pool].records = record; kvp_file_info[pool].num_records++; + + if (debug) + syslog(LOG_DEBUG, "%s: added: pool=%d key=%s val=%s", + __func__, pool, key, value); + kvp_update_file(pool); return 0; } @@ -1661,6 +1707,7 @@ void print_usage(char *argv[]) fprintf(stderr, "Usage: %s [options]\n" "Options are:\n" " -n, --no-daemon stay in foreground, don't daemonize\n" + " -d, --debug Enable debug logs(syslog debug by default)\n" " -h, --help print this help\n", argv[0]); } @@ -1682,10 +1729,11 @@ int main(int argc, char *argv[]) static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"no-daemon", no_argument, 0, 'n' }, + {"debug", no_argument, 0, 'd' }, {0, 0, 0, 0 } }; - while ((opt = getopt_long(argc, argv, "hn", long_options, + while ((opt = getopt_long(argc, argv, "hnd", long_options, &long_index)) != -1) { switch (opt) { case 'n': @@ -1694,6 +1742,9 @@ int main(int argc, char *argv[]) case 'h': print_usage(argv); exit(0); + case 'd': + debug = 1; + break; default: print_usage(argv); exit(EXIT_FAILURE); @@ -1716,6 +1767,9 @@ int main(int argc, char *argv[]) */ kvp_get_domain_name(full_domain_name, sizeof(full_domain_name)); + if (debug) + syslog(LOG_INFO, "Logging debug info in syslog(debug)"); + if (kvp_file_init()) { syslog(LOG_ERR, "Failed to initialize the pools"); exit(EXIT_FAILURE);