From cb83b559bea39f207ee214ee2972657e8576ed18 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:18 -0700 Subject: idpf: add support for Tx refillqs in flow scheduling mode In certain production environments, it is possible for completion tags to collide, meaning N packets with the same completion tag are in flight at the same time. In this environment, any given Tx queue is effectively used to send both slower traffic and higher throughput traffic simultaneously. This is the result of a customer's specific configuration in the device pipeline, the details of which Intel cannot provide. This configuration results in a small number of out-of-order completions, i.e., a small number of packets in flight. The existing guardrails in the driver only protect against a large number of packets in flight. The slower flow completions are delayed which causes the out-of-order completions. The fast flow will continue sending traffic and generating tags. Because tags are generated on the fly, the fast flow eventually uses the same tag for a packet that is still in flight from the slower flow. The driver has no idea which packet it should clean when it processes the completion with that tag, but it will look for the packet on the buffer ring before the hash table. If the slower flow packet completion is processed first, it will end up cleaning the fast flow packet on the ring prematurely. This leaves the descriptor ring in a bad state resulting in a crash or Tx timeout. In summary, generating a tag when a packet is sent can lead to the same tag being associated with multiple packets. This can lead to resource leaks, crashes, and/or Tx timeouts. Before we can replace the tag generation, we need a new mechanism for the send path to know what tag to use next. The driver will allocate and initialize a refillq for each TxQ with all of the possible free tag values. During send, the driver grabs the next free tag from the refillq from next_to_clean. While cleaning the packet, the clean routine posts the tag back to the refillq's next_to_use to indicate that it is now free to use. This mechanism works exactly the same way as the existing Rx refill queues, which post the cleaned buffer IDs back to the buffer queue to be reposted to HW. Since we're using the refillqs for both Rx and Tx now, genericize some of the existing refillq support. Note: the refillqs will not be used yet. This is only demonstrating how they will be used to pass free tags back to the send path. 
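To make the mechanism concrete, here is a stand-alone sketch of the get/post cycle described above. It is illustrative only: the names are made up, there is no locking, and plain bit operations stand in for the driver's FIELD_PREP()/FIELD_GET() and idpf_queue_has()/idpf_queue_change() helpers.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define RFL_BUFID_M  0xffffu      /* bits 15:0: free tag / buffer ID */
#define RFL_GEN_M    (1u << 16)   /* bit 16: SW generation bit */

struct sw_refillq {
    uint32_t *ring;
    uint32_t desc_count;
    uint32_t next_to_use;    /* clean path posts freed tags here */
    uint32_t next_to_clean;  /* send path pops free tags from here */
    bool post_gen;           /* GEN value the clean path writes */
    bool get_gen;            /* GEN value the send path expects */
};

int refillq_init(struct sw_refillq *rq, uint32_t count)
{
    rq->ring = calloc(count, sizeof(*rq->ring));
    if (!rq->ring)
        return -1;

    rq->desc_count = count;
    rq->next_to_use = 0;
    rq->next_to_clean = 0;
    rq->post_gen = true;
    rq->get_gen = true;

    /* Every possible tag starts out free. */
    for (uint32_t i = 0; i < count; i++)
        rq->ring[i] = i | RFL_GEN_M;

    /* Filling the ring counts as one wrap of the posting side. */
    rq->post_gen = !rq->post_gen;

    return 0;
}

/* Send path: pop the next free tag; false means no tags are left. */
bool refillq_get_free_tag(struct sw_refillq *rq, uint16_t *tag)
{
    uint32_t desc = rq->ring[rq->next_to_clean];

    /* GEN mismatch: the clean path has not refilled this slot yet. */
    if (!!(desc & RFL_GEN_M) != rq->get_gen)
        return false;

    *tag = desc & RFL_BUFID_M;

    if (++rq->next_to_clean == rq->desc_count) {
        rq->next_to_clean = 0;
        rq->get_gen = !rq->get_gen;
    }

    return true;
}

/* Clean path: the packet that used @tag completed, return it to the pool. */
void refillq_post_tag(struct sw_refillq *rq, uint16_t tag)
{
    rq->ring[rq->next_to_use] = tag | (rq->post_gen ? RFL_GEN_M : 0);

    if (++rq->next_to_use == rq->desc_count) {
        rq->next_to_use = 0;
        rq->post_gen = !rq->post_gen;
    }
}

The send path would call refillq_get_free_tag() to obtain the completion tag for a packet, and the completion clean path would call refillq_post_tag() once that packet is done -- the same cycle the real idpf_tx_get_free_buf_id()/idpf_post_buf_refill() pair implements in the diff below.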
Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 93 ++++++++++++++++++++++++++--- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 8 ++- 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 66a1b040639d..9b63944235fb 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -139,6 +139,9 @@ static void idpf_tx_desc_rel(struct idpf_tx_queue *txq) if (!txq->desc_ring) return; + if (txq->refillq) + kfree(txq->refillq->ring); + dmam_free_coherent(txq->dev, txq->size, txq->desc_ring, txq->dma); txq->desc_ring = NULL; txq->next_to_use = 0; @@ -244,6 +247,7 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, struct idpf_tx_queue *tx_q) { struct device *dev = tx_q->dev; + struct idpf_sw_queue *refillq; int err; err = idpf_tx_buf_alloc_all(tx_q); @@ -267,6 +271,29 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, tx_q->next_to_clean = 0; idpf_queue_set(GEN_CHK, tx_q); + if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) + return 0; + + refillq = tx_q->refillq; + refillq->desc_count = tx_q->desc_count; + refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), + GFP_KERNEL); + if (!refillq->ring) { + err = -ENOMEM; + goto err_alloc; + } + + for (unsigned int i = 0; i < refillq->desc_count; i++) + refillq->ring[i] = + FIELD_PREP(IDPF_RFL_BI_BUFID_M, i) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, + idpf_queue_has(GEN_CHK, refillq)); + + /* Go ahead and flip the GEN bit since this counts as filling + * up the ring, i.e. we already ring wrapped. + */ + idpf_queue_change(GEN_CHK, refillq); + return 0; err_alloc: @@ -603,18 +630,18 @@ static int idpf_rx_hdr_buf_alloc_all(struct idpf_buf_queue *bufq) } /** - * idpf_rx_post_buf_refill - Post buffer id to refill queue + * idpf_post_buf_refill - Post buffer id to refill queue * @refillq: refill queue to post to * @buf_id: buffer id to post */ -static void idpf_rx_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) +static void idpf_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) { u32 nta = refillq->next_to_use; /* store the buffer ID and the SW maintained GEN bit to the refillq */ refillq->ring[nta] = - FIELD_PREP(IDPF_RX_BI_BUFID_M, buf_id) | - FIELD_PREP(IDPF_RX_BI_GEN_M, + FIELD_PREP(IDPF_RFL_BI_BUFID_M, buf_id) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, idpf_queue_has(GEN_CHK, refillq)); if (unlikely(++nta == refillq->desc_count)) { @@ -995,6 +1022,11 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) struct idpf_txq_group *txq_grp = &vport->txq_grps[i]; for (j = 0; j < txq_grp->num_txq; j++) { + if (flow_sch_en) { + kfree(txq_grp->txqs[j]->refillq); + txq_grp->txqs[j]->refillq = NULL; + } + kfree(txq_grp->txqs[j]); txq_grp->txqs[j] = NULL; } @@ -1414,6 +1446,13 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) } idpf_queue_set(FLOW_SCH_EN, q); + + q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL); + if (!q->refillq) + goto err_alloc; + + idpf_queue_set(GEN_CHK, q->refillq); + idpf_queue_set(RFL_GEN_CHK, q->refillq); } if (!split) @@ -2005,6 +2044,8 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); + idpf_post_buf_refill(txq->refillq, compl_tag); + /* If we didn't clean anything on the ring, this packet must be * in the hash table. Go clean it there. 
*/ @@ -2364,6 +2405,37 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) return ntu; } +/** + * idpf_tx_get_free_buf_id - get a free buffer ID from the refill queue + * @refillq: refill queue to get buffer ID from + * @buf_id: return buffer ID + * + * Return: true if a buffer ID was found, false if not + */ +static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, + u16 *buf_id) +{ + u32 ntc = refillq->next_to_clean; + u32 refill_desc; + + refill_desc = refillq->ring[ntc]; + + if (unlikely(idpf_queue_has(RFL_GEN_CHK, refillq) != + !!(refill_desc & IDPF_RFL_BI_GEN_M))) + return false; + + *buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); + + if (unlikely(++ntc == refillq->desc_count)) { + idpf_queue_change(RFL_GEN_CHK, refillq); + ntc = 0; + } + + refillq->next_to_clean = ntc; + + return true; +} + /** * idpf_tx_splitq_map - Build the Tx flex descriptor * @tx_q: queue to send buffer on @@ -2912,6 +2984,13 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, } if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &tx_params.compl_tag))) { + u64_stats_update_begin(&tx_q->stats_sync); + u64_stats_inc(&tx_q->q_stats.q_busy); + u64_stats_update_end(&tx_q->stats_sync); + } + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; /* Set the RE bit to catch any packets that may have not been @@ -3472,7 +3551,7 @@ payload: skip_data: rx_buf->netmem = 0; - idpf_rx_post_buf_refill(refillq, buf_id); + idpf_post_buf_refill(refillq, buf_id); IDPF_RX_BUMP_NTC(rxq, ntc); /* skip if it is non EOP desc */ @@ -3580,10 +3659,10 @@ static void idpf_rx_clean_refillq(struct idpf_buf_queue *bufq, bool failure; if (idpf_queue_has(RFL_GEN_CHK, refillq) != - !!(refill_desc & IDPF_RX_BI_GEN_M)) + !!(refill_desc & IDPF_RFL_BI_GEN_M)) break; - buf_id = FIELD_GET(IDPF_RX_BI_BUFID_M, refill_desc); + buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); failure = idpf_rx_update_bufq_desc(bufq, buf_id, buf_desc); if (failure) break; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 281de655a813..58232a1bd0a9 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -108,8 +108,8 @@ do { \ */ #define IDPF_TX_SPLITQ_RE_MIN_GAP 64 -#define IDPF_RX_BI_GEN_M BIT(16) -#define IDPF_RX_BI_BUFID_M GENMASK(15, 0) +#define IDPF_RFL_BI_GEN_M BIT(16) +#define IDPF_RFL_BI_BUFID_M GENMASK(15, 0) #define IDPF_RXD_EOF_SPLITQ VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_EOF_M #define IDPF_RXD_EOF_SINGLEQ VIRTCHNL2_RX_BASE_DESC_STATUS_EOF_M @@ -622,6 +622,7 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64, * @cleaned_pkts: Number of packets cleaned for the above said case * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @stash: Tx buffer stash for Flow-based scheduling mode + * @refillq: Pointer to refill queue * @compl_tag_bufid_m: Completion tag buffer id mask * @compl_tag_cur_gen: Used to keep track of current completion tag generation * @compl_tag_gen_max: To determine when compl_tag_cur_gen should be reset @@ -671,6 +672,7 @@ struct idpf_tx_queue { u16 tx_max_bufs; struct idpf_txq_stash *stash; + struct idpf_sw_queue *refillq; u16 compl_tag_bufid_m; u16 compl_tag_cur_gen; @@ -692,7 +694,7 @@ struct idpf_tx_queue { __cacheline_group_end_aligned(cold); }; libeth_cacheline_set_assert(struct idpf_tx_queue, 64, - 112 + sizeof(struct u64_stats_sync), + 120 + 
sizeof(struct u64_stats_sync), 24); /** -- cgit v1.2.3 From f2d18e16479cac7a708d77cbfb4220a9114a71fc Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:19 -0700 Subject: idpf: improve when to set RE bit logic Track the gap between next_to_use and the last RE index. Set RE again if the gap is large enough to ensure RE bit is set frequently. This is critical before removing the stashing mechanisms because the opportunistic descriptor ring cleaning from the out-of-order completions will go away. Previously the descriptors would be "cleaned" by both the descriptor (RE) completion and the out-of-order completions. Without the latter, we must ensure the RE bit is set more frequently. Otherwise, it's theoretically possible for the descriptor ring next_to_clean to never advance. The previous implementation was dependent on the start of a packet falling on a 64th index in the descriptor ring, which is not guaranteed with large packets. Signed-off-by: Luigi Rizzo Signed-off-by: Brian Vazquez Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 20 +++++++++++++++++++- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 6 ++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 9b63944235fb..ee59153508af 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -294,6 +294,8 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, */ idpf_queue_change(GEN_CHK, refillq); + tx_q->last_re = tx_q->desc_count - IDPF_TX_SPLITQ_RE_MIN_GAP; + return 0; err_alloc: @@ -2912,6 +2914,21 @@ static void idpf_tx_set_tstamp_desc(union idpf_flex_tx_ctx_desc *ctx_desc, { } #endif /* CONFIG_PTP_1588_CLOCK */ +/** + * idpf_tx_splitq_need_re - check whether RE bit needs to be set + * @tx_q: pointer to Tx queue + * + * Return: true if RE bit needs to be set, false otherwise + */ +static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q) +{ + int gap = tx_q->next_to_use - tx_q->last_re; + + gap += (gap < 0) ? tx_q->desc_count : 0; + + return gap >= IDPF_TX_SPLITQ_RE_MIN_GAP; +} + /** * idpf_tx_splitq_frame - Sends buffer on Tx ring using flex descriptors * @skb: send buffer @@ -2998,9 +3015,10 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, * MIN_RING size to ensure it will be set at least once each * time around the ring. */ - if (!(tx_q->next_to_use % IDPF_TX_SPLITQ_RE_MIN_GAP)) { + if (idpf_tx_splitq_need_re(tx_q)) { tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE; tx_q->txq_grp->num_completions_pending++; + tx_q->last_re = tx_q->next_to_use; } if (skb->ip_summed == CHECKSUM_PARTIAL) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 58232a1bd0a9..c75ca5d3e57c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -610,6 +610,8 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64, * @netdev: &net_device corresponding to this queue * @next_to_use: Next descriptor to use * @next_to_clean: Next descriptor to clean + * @last_re: last descriptor index that RE bit was set + * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @cleaned_bytes: Splitq only, TXQ only: When a TX completion is received on * the TX completion queue, it can be for any TXQ associated * with that completion queue. 
This means we can clean up to @@ -620,7 +622,6 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64, * only once at the end of the cleaning routine. * @clean_budget: singleq only, queue cleaning budget * @cleaned_pkts: Number of packets cleaned for the above said case - * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @stash: Tx buffer stash for Flow-based scheduling mode * @refillq: Pointer to refill queue * @compl_tag_bufid_m: Completion tag buffer id mask @@ -663,6 +664,8 @@ struct idpf_tx_queue { __cacheline_group_begin_aligned(read_write); u16 next_to_use; u16 next_to_clean; + u16 last_re; + u16 tx_max_bufs; union { u32 cleaned_bytes; @@ -670,7 +673,6 @@ struct idpf_tx_queue { }; u16 cleaned_pkts; - u16 tx_max_bufs; struct idpf_txq_stash *stash; struct idpf_sw_queue *refillq; -- cgit v1.2.3 From b61dfa9bc4430ad82b96d3a7c1c485350f91b467 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:20 -0700 Subject: idpf: simplify and fix splitq Tx packet rollback error path Move (and rename) the existing rollback logic to singleq.c since that will be the only consumer. Create a simplified splitq specific rollback function to loop through and unmap tx_bufs based on the completion tag. This is critical before replacing the Tx buffer ring with the buffer pool since the previous rollback indexing will not work to unmap the chained buffers from the pool. Cache the next_to_use index before any portion of the packet is put on the descriptor ring. In case of an error, the rollback will bump tail to the correct next_to_use value. Because the splitq path now supports different types of context descriptors (and potentially multiple in the future), this will take care of rolling back any and all context descriptors encoded on the ring for the erroneous packet. The previous rollback logic was broken for PTP packets since it would not account for the PTP context descriptor. 
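The ordering that makes this work can be shown with a toy model (illustrative names only; the real code is idpf_tx_splitq_frame() and idpf_tx_splitq_pkt_err_unmap() in the diff below): take the next_to_use snapshot before any descriptor, context or data, is written for the packet, and restore it on failure.

#include <stdbool.h>
#include <stdint.h>

struct toy_txq {
    uint16_t next_to_use;
    uint16_t desc_count;
};

/* Pretend to write one descriptor; 'fail' simulates a DMA mapping error. */
static bool write_one_desc(struct toy_txq *q, bool fail)
{
    if (fail)
        return false;

    if (++q->next_to_use == q->desc_count)
        q->next_to_use = 0;

    return true;
}

/*
 * Send one packet that needs 'ndescs' descriptors, context descriptors
 * included.  'fail_at' is the descriptor index that fails, or -1 for none.
 * Rewinding to the snapshot undoes every descriptor written for the packet,
 * which is why the snapshot must be taken before anything is written.
 */
bool send_packet(struct toy_txq *q, unsigned int ndescs, int fail_at)
{
    uint16_t prev_ntu = q->next_to_use;   /* snapshot first */

    for (unsigned int i = 0; i < ndescs; i++) {
        if (!write_one_desc(q, (int)i == fail_at)) {
            /* Unmap/free the packet's buffers here, then rewind the
             * ring so tail is bumped to the pre-packet position.
             */
            q->next_to_use = prev_ntu;
            return false;
        }
    }

    return true;
}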
Fixes: 1a49cf814fe1 ("idpf: add Tx timestamp flows") Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/idpf/idpf_singleq_txrx.c | 57 +++++++++++++- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 91 +++++++++------------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 5 +- 3 files changed, 95 insertions(+), 58 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 555879b1248d..57c0f5ab8f9e 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -179,6 +179,58 @@ static int idpf_tx_singleq_csum(struct sk_buff *skb, return 1; } +/** + * idpf_tx_singleq_dma_map_error - handle TX DMA map errors + * @txq: queue to send buffer on + * @skb: send buffer + * @first: original first buffer info buffer for packet + * @idx: starting point on ring to unwind + */ +static void idpf_tx_singleq_dma_map_error(struct idpf_tx_queue *txq, + struct sk_buff *skb, + struct idpf_tx_buf *first, u16 idx) +{ + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + /* clear dma mappings for failed tx_buf map */ + for (;;) { + struct idpf_tx_buf *tx_buf; + + tx_buf = &txq->tx_buf[idx]; + libeth_tx_complete(tx_buf, &cp); + if (tx_buf == first) + break; + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + if (skb_is_gso(skb)) { + union idpf_tx_flex_desc *tx_desc; + + /* If we failed a DMA mapping for a TSO packet, we will have + * used one additional descriptor for a context + * descriptor. Reset that here. 
+ */ + tx_desc = &txq->flex_tx[idx]; + memset(tx_desc, 0, sizeof(*tx_desc)); + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + /* Update tail in case netdev_xmit_more was previously true */ + idpf_tx_buf_hw_update(txq, idx, false); +} + /** * idpf_tx_singleq_map - Build the Tx base descriptor * @tx_q: queue to send buffer on @@ -219,8 +271,9 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_singleq_dma_map_error(tx_q, skb, + first, i); /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index ee59153508af..527d56bcbbef 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2339,57 +2339,6 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, return count; } -/** - * idpf_tx_dma_map_error - handle TX DMA map errors - * @txq: queue to send buffer on - * @skb: send buffer - * @first: original first buffer info buffer for packet - * @idx: starting point on ring to unwind - */ -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 idx) -{ - struct libeth_sq_napi_stats ss = { }; - struct libeth_cq_pp cp = { - .dev = txq->dev, - .ss = &ss, - }; - - u64_stats_update_begin(&txq->stats_sync); - u64_stats_inc(&txq->q_stats.dma_map_errs); - u64_stats_update_end(&txq->stats_sync); - - /* clear dma mappings for failed tx_buf map */ - for (;;) { - struct idpf_tx_buf *tx_buf; - - tx_buf = &txq->tx_buf[idx]; - libeth_tx_complete(tx_buf, &cp); - if (tx_buf == first) - break; - if (idx == 0) - idx = txq->desc_count; - idx--; - } - - if (skb_is_gso(skb)) { - union idpf_tx_flex_desc *tx_desc; - - /* If we failed a DMA mapping for a TSO packet, we will have - * used one additional descriptor for a context - * descriptor. Reset that here. - */ - tx_desc = &txq->flex_tx[idx]; - memset(tx_desc, 0, sizeof(*tx_desc)); - if (idx == 0) - idx = txq->desc_count; - idx--; - } - - /* Update tail in case netdev_xmit_more was previously true */ - idpf_tx_buf_hw_update(txq, idx, false); -} - /** * idpf_tx_splitq_bump_ntu - adjust NTU and generation * @txq: the tx ring to wrap @@ -2438,6 +2387,37 @@ static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, return true; } +/** + * idpf_tx_splitq_pkt_err_unmap - Unmap buffers and bump tail in case of error + * @txq: Tx queue to unwind + * @params: pointer to splitq params struct + * @first: starting buffer for packet to unmap + */ +static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, + struct idpf_tx_splitq_params *params, + struct idpf_tx_buf *first) +{ + struct libeth_sq_napi_stats ss = { }; + struct idpf_tx_buf *tx_buf = first; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + u32 idx = 0; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + do { + libeth_tx_complete(tx_buf, &cp); + idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + } while (idpf_tx_buf_compl_tag(tx_buf) == params->compl_tag); + + /* Update tail in case netdev_xmit_more was previously true. 
*/ + idpf_tx_buf_hw_update(txq, params->prev_ntu, false); +} + /** * idpf_tx_splitq_map - Build the Tx flex descriptor * @tx_q: queue to send buffer on @@ -2482,8 +2462,9 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); first->nr_frags++; idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; @@ -2939,7 +2920,9 @@ static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q) static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q) { - struct idpf_tx_splitq_params tx_params = { }; + struct idpf_tx_splitq_params tx_params = { + .prev_ntu = tx_q->next_to_use, + }; union idpf_flex_tx_ctx_desc *ctx_desc; struct idpf_tx_buf *first; unsigned int count; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index c75ca5d3e57c..a7632d845a2a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -196,6 +196,7 @@ struct idpf_tx_offload_params { * @compl_tag: Associated tag for completion * @td_tag: Descriptor tunneling tag * @offload: Offload parameters + * @prev_ntu: stored TxQ next_to_use in case of rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -206,6 +207,8 @@ struct idpf_tx_splitq_params { }; struct idpf_tx_offload_params offload; + + u16 prev_ntu; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -1042,8 +1045,6 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb); -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 ring_idx); unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, struct sk_buff *skb); void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue); -- cgit v1.2.3 From 5f417d551324d2894168b362f2429d120ab06243 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:21 -0700 Subject: idpf: replace flow scheduling buffer ring with buffer pool Replace the TxQ buffer ring with one large pool/array of buffers (only for flow scheduling). This eliminates the tag generation and makes it impossible for a tag to be associated with more than one packet. The completion tag passed to HW through the descriptor is the index into the array. That same completion tag is posted back to the driver in the completion descriptor, and used to index into the array to quickly retrieve the buffer during cleaning. In this way, the tags are treated as a fix sized resource. If all tags are in use, no more packets can be sent on that particular queue (until some are freed up). The tag pool size is 64K since the completion tag width is 16 bits. For each packet, the driver pulls a free tag from the refillq to get the next free buffer index. When cleaning is complete, the tag is posted back to the refillq. A multi-frag packet spans multiple buffers in the driver, therefore it uses multiple buffer indexes/tags from the pool. Each frag pulls from the refillq to get the next free buffer index. 
These are tracked in a next_buf field that replaces the completion tag field in the buffer struct. This chains the buffers together so that the packet can be cleaned from the starting completion tag taken from the completion descriptor, then from the next_buf field for each subsequent buffer. In case of a dma_mapping_error occurs or the refillq runs out of free buf_ids, the packet will execute the rollback error path. This unmaps any buffers previously mapped for the packet. Since several free buf_ids could have already been pulled from the refillq, we need to restore its original state as well. Otherwise, the buf_ids/tags will be leaked and not used again until the queue is reallocated. Descriptor completions only advance the descriptor ring index to "clean" the descriptors. The packet completions only clean the buffers associated with the given packet completion tag and do not update the descriptor ring index. When operating in queue based scheduling mode, the array still acts as a ring and will only have TxQ descriptor count entries. The tx_bufs are still associated 1:1 with the descriptor ring entries and we can use the conventional indexing mechanisms. Fixes: c2d548cad150 ("idpf: add TX splitq napi poll support") Signed-off-by: Luigi Rizzo Signed-off-by: Brian Vazquez Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 204 +++++++++++++--------------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 10 +- 2 files changed, 103 insertions(+), 111 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 527d56bcbbef..5fe329a7c944 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -13,6 +13,7 @@ struct idpf_tx_stash { struct libeth_sqe buf; }; +#define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv) #define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) LIBETH_SQE_CHECK_PRIV(u32); @@ -91,7 +92,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) return; /* Free all the Tx buffer sk_buffs */ - for (i = 0; i < txq->desc_count; i++) + for (i = 0; i < txq->buf_pool_size; i++) libeth_tx_complete(&txq->tx_buf[i], &cp); kfree(txq->tx_buf); @@ -199,14 +200,17 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport) static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) { struct idpf_buf_lifo *buf_stack; - int buf_size; int i; /* Allocate book keeping buffers only. Buffers to be supplied to HW * are allocated by kernel network stack and received as part of skb */ - buf_size = sizeof(struct idpf_tx_buf) * tx_q->desc_count; - tx_q->tx_buf = kzalloc(buf_size, GFP_KERNEL); + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) + tx_q->buf_pool_size = U16_MAX; + else + tx_q->buf_pool_size = tx_q->desc_count; + tx_q->tx_buf = kcalloc(tx_q->buf_pool_size, sizeof(*tx_q->tx_buf), + GFP_KERNEL); if (!tx_q->tx_buf) return -ENOMEM; @@ -275,7 +279,7 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, return 0; refillq = tx_q->refillq; - refillq->desc_count = tx_q->desc_count; + refillq->desc_count = tx_q->buf_pool_size; refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), GFP_KERNEL); if (!refillq->ring) { @@ -1869,6 +1873,12 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, struct idpf_tx_buf *tx_buf; bool clean_complete = true; + if (descs_only) { + /* Bump ring index to mark as cleaned. 
*/ + tx_q->next_to_clean = end; + return true; + } + tx_desc = &tx_q->flex_tx[ntc]; next_pending_desc = &tx_q->flex_tx[end]; tx_buf = &tx_q->tx_buf[ntc]; @@ -1935,87 +1945,43 @@ do { \ } while (0) /** - * idpf_tx_clean_buf_ring - clean flow scheduling TX queue buffers + * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) + * @buf_id: packet's starting buffer ID, from completion descriptor * @cleaned: pointer to stats struct to track cleaned packets/bytes * @budget: Used to determine if we are in netpoll * - * Cleans all buffers associated with the input completion tag either from the - * TX buffer ring or from the hash table if the buffers were previously - * stashed. Returns the byte/segment count for the cleaned packet associated - * this completion tag. + * Clean all buffers associated with the packet starting at buf_id. Returns the + * byte/segment count for the cleaned packet. */ -static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, - struct libeth_sq_napi_stats *cleaned, - int budget) +static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, + struct libeth_sq_napi_stats *cleaned, + int budget) { - u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = cleaned, .napi = budget, }; - u16 ntc, orig_idx = idx; - - tx_buf = &txq->tx_buf[idx]; - - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX || - idpf_tx_buf_compl_tag(tx_buf) != compl_tag)) - return false; + tx_buf = &txq->tx_buf[buf_id]; if (tx_buf->type == LIBETH_SQE_SKB) { if (skb_shinfo(tx_buf->skb)->tx_flags & SKBTX_IN_PROGRESS) idpf_tx_read_tstamp(txq, tx_buf->skb); libeth_tx_complete(tx_buf, &cp); + idpf_post_buf_refill(txq->refillq, buf_id); } - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + buf_id = idpf_tx_buf_next(tx_buf); - while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) { + tx_buf = &txq->tx_buf[buf_id]; libeth_tx_complete(tx_buf, &cp); - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + idpf_post_buf_refill(txq->refillq, buf_id); } - /* - * It's possible the packet we just cleaned was an out of order - * completion, which means we can stash the buffers starting from - * the original next_to_clean and reuse the descriptors. We need - * to compare the descriptor ring next_to_clean packet's "first" buffer - * to the "first" buffer of the packet we just cleaned to determine if - * this is the case. Howevever, next_to_clean can point to either a - * reserved buffer that corresponds to a context descriptor used for the - * next_to_clean packet (TSO packet) or the "first" buffer (single - * packet). The orig_idx from the packet we just cleaned will always - * point to the "first" buffer. If next_to_clean points to a reserved - * buffer, let's bump ntc once and start the comparison from there. - */ - ntc = txq->next_to_clean; - tx_buf = &txq->tx_buf[ntc]; - - if (tx_buf->type == LIBETH_SQE_CTX) - idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf); - - /* - * If ntc still points to a different "first" buffer, clean the - * descriptor ring and stash all of the buffers for later cleaning. If - * we cannot stash all of the buffers, next_to_clean will point to the - * "first" buffer of the packet that could not be stashed and cleaning - * will start there next time. 
- */ - if (unlikely(tx_buf != &txq->tx_buf[orig_idx] && - !idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned, - true))) - return true; - - /* - * Otherwise, update next_to_clean to reflect the cleaning that was - * done above. - */ - txq->next_to_clean = idx; - return true; } @@ -2046,12 +2012,10 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); - idpf_post_buf_refill(txq->refillq, compl_tag); - /* If we didn't clean anything on the ring, this packet must be * in the hash table. Go clean it there. */ - if (!idpf_tx_clean_buf_ring(txq, compl_tag, cleaned, budget)) + if (!idpf_tx_clean_bufs(txq, compl_tag, cleaned, budget)) idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget); } @@ -2364,7 +2328,7 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) * Return: true if a buffer ID was found, false if not */ static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, - u16 *buf_id) + u32 *buf_id) { u32 ntc = refillq->next_to_clean; u32 refill_desc; @@ -2397,25 +2361,34 @@ static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, struct idpf_tx_splitq_params *params, struct idpf_tx_buf *first) { + struct idpf_sw_queue *refillq = txq->refillq; struct libeth_sq_napi_stats ss = { }; struct idpf_tx_buf *tx_buf = first; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - u32 idx = 0; u64_stats_update_begin(&txq->stats_sync); u64_stats_inc(&txq->q_stats.dma_map_errs); u64_stats_update_end(&txq->stats_sync); - do { + libeth_tx_complete(tx_buf, &cp); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + tx_buf = &txq->tx_buf[idpf_tx_buf_next(tx_buf)]; libeth_tx_complete(tx_buf, &cp); - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); - } while (idpf_tx_buf_compl_tag(tx_buf) == params->compl_tag); + } /* Update tail in case netdev_xmit_more was previously true. */ idpf_tx_buf_hw_update(txq, params->prev_ntu, false); + + if (!refillq) + return; + + /* Restore refillq state to avoid leaking tags. */ + if (params->prev_refill_gen != idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = params->prev_refill_ntc; } /** @@ -2439,6 +2412,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, struct netdev_queue *nq; struct sk_buff *skb; skb_frag_t *frag; + u32 next_buf_id; u16 td_cmd = 0; dma_addr_t dma; @@ -2456,18 +2430,16 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, tx_buf = first; first->nr_frags = 0; - params->compl_tag = - (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i; - for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (unlikely(dma_mapping_error(tx_q->dev, dma))) + if (unlikely(dma_mapping_error(tx_q->dev, dma))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; return idpf_tx_splitq_pkt_err_unmap(tx_q, params, first); + } first->nr_frags++; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; tx_buf->type = LIBETH_SQE_FRAG; /* record length, and DMA address */ @@ -2523,29 +2495,14 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, max_data); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } - /* Since this packet has a buffer that is going to span - * multiple descriptors, it's going to leave holes in - * to the TX buffer ring. 
To ensure these holes do not - * cause issues in the cleaning routines, we will clear - * them of any stale data and assign them the same - * completion tag as the current packet. Then when the - * packet is being cleaned, the cleaning routines will - * simply pass over these holes and finish cleaning the - * rest of the packet. - */ - tx_buf->type = LIBETH_SQE_EMPTY; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; - /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, * max_data will be >= 12K and <= 16K-1. On any @@ -2570,15 +2527,26 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &next_buf_id))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); + } + } else { + next_buf_id = i; + } + idpf_tx_buf_next(tx_buf) = next_buf_id; + tx_buf = &tx_q->tx_buf[next_buf_id]; + size = skb_frag_size(frag); data_len -= size; @@ -2593,6 +2561,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, /* write last descriptor with RS and EOP bits */ first->rs_idx = i; + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; td_cmd |= params->eop_cmd; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); i = idpf_tx_splitq_bump_ntu(tx_q, i); @@ -2801,8 +2770,6 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq) union idpf_flex_tx_ctx_desc *desc; int i = txq->next_to_use; - txq->tx_buf[i].type = LIBETH_SQE_CTX; - /* grab the next descriptor */ desc = &txq->flex_ctx[i]; txq->next_to_use = idpf_tx_splitq_bump_ntu(txq, i); @@ -2927,6 +2894,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_buf *first; unsigned int count; int tso, idx; + u32 buf_id; count = idpf_tx_desc_count_required(tx_q, skb); if (unlikely(!count)) @@ -2970,26 +2938,28 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, idpf_tx_set_tstamp_desc(ctx_desc, idx); } - /* record the location of the first descriptor for this packet */ - first = &tx_q->tx_buf[tx_q->next_to_use]; - first->skb = skb; + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + struct idpf_sw_queue *refillq = tx_q->refillq; - if (tso) { - first->packets = tx_params.offload.tso_segs; - first->bytes = skb->len + - ((first->packets - 1) * tx_params.offload.tso_hdr_len); - } else { - first->packets = 1; - first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); - } + /* Save refillq state in case of a packet rollback. Otherwise, + * the tags will be leaked since they will be popped from the + * refillq but never reposted during cleaning. 
+ */ + tx_params.prev_refill_gen = + idpf_queue_has(RFL_GEN_CHK, refillq); + tx_params.prev_refill_ntc = refillq->next_to_clean; - if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, - &tx_params.compl_tag))) { - u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_inc(&tx_q->q_stats.q_busy); - u64_stats_update_end(&tx_q->stats_sync); + &buf_id))) { + if (tx_params.prev_refill_gen != + idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = tx_params.prev_refill_ntc; + + tx_q->next_to_use = tx_params.prev_ntu; + return idpf_tx_drop_skb(tx_q, skb); } + tx_params.compl_tag = buf_id; tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; @@ -3008,6 +2978,8 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TXD_FLEX_FLOW_CMD_CS_EN; } else { + buf_id = tx_q->next_to_use; + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2; tx_params.eop_cmd = IDPF_TXD_LAST_DESC_CMD; @@ -3015,6 +2987,18 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TX_FLEX_DESC_CMD_CS_EN; } + first = &tx_q->tx_buf[buf_id]; + first->skb = skb; + + if (tso) { + first->packets = tx_params.offload.tso_segs; + first->bytes = skb->len + + ((first->packets - 1) * tx_params.offload.tso_hdr_len); + } else { + first->packets = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + } + idpf_tx_splitq_map(tx_q, &tx_params, first); return NETDEV_TX_OK; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index a7632d845a2a..d86246c320c8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -137,6 +137,8 @@ do { \ ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? 
\ 0 : (txq)->compl_tag_cur_gen) +#define IDPF_TXBUF_NULL U32_MAX + #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) #define IDPF_TX_FLAGS_TSO BIT(0) @@ -197,6 +199,8 @@ struct idpf_tx_offload_params { * @td_tag: Descriptor tunneling tag * @offload: Offload parameters * @prev_ntu: stored TxQ next_to_use in case of rollback + * @prev_refill_ntc: stored refillq next_to_clean in case of packet rollback + * @prev_refill_gen: stored refillq generation bit in case of packet rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -209,6 +213,8 @@ struct idpf_tx_splitq_params { struct idpf_tx_offload_params offload; u16 prev_ntu; + u16 prev_refill_ntc; + bool prev_refill_gen; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -638,6 +644,7 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64, * @size: Length of descriptor ring in bytes * @dma: Physical address of ring * @q_vector: Backreference to associated vector + * @buf_pool_size: Total number of idpf_tx_buf */ struct idpf_tx_queue { __cacheline_group_begin_aligned(read_mostly); @@ -696,11 +703,12 @@ struct idpf_tx_queue { dma_addr_t dma; struct idpf_q_vector *q_vector; + u32 buf_pool_size; __cacheline_group_end_aligned(cold); }; libeth_cacheline_set_assert(struct idpf_tx_queue, 64, 120 + sizeof(struct u64_stats_sync), - 24); + 32); /** * struct idpf_buf_queue - software structure representing a buffer queue -- cgit v1.2.3 From 0c3f135e840d4a2ba4253e15d530ec61bc30718e Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:22 -0700 Subject: idpf: stop Tx if there are insufficient buffer resources The Tx refillq logic will cause packets to be silently dropped if there are not enough buffer resources available to send a packet in flow scheduling mode. Instead, determine how many buffers are needed along with number of descriptors. Make sure there are enough of both resources to send the packet, and stop the queue if not. 
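The buffer-side check is plain ring arithmetic on the refillq indices. A stand-alone sketch follows (illustrative names; the driver's versions are idpf_tx_splitq_get_free_bufs() and idpf_txq_has_room() in the diff below, where the latter also accounts for the completion queue backlog):

#include <stdbool.h>
#include <stdint.h>

struct toy_refillq {
    uint32_t desc_count;
    uint32_t next_to_use;    /* producer: clean path posts freed tags */
    uint32_t next_to_clean;  /* consumer: send path pops free tags */
};

/* Tags still available to the send path.  Deliberately counts one low,
 * matching the driver's usual ring "unused" accounting convention.
 */
uint32_t refillq_free_tags(const struct toy_refillq *rq)
{
    return (rq->next_to_use > rq->next_to_clean ? 0 : rq->desc_count) +
           rq->next_to_use - rq->next_to_clean - 1;
}

/* Stop condition: both the descriptor and the buffer/tag budgets must fit. */
bool txq_has_room(uint32_t free_descs, const struct toy_refillq *rq,
                  uint32_t descs_needed, uint32_t bufs_needed)
{
    return free_descs >= descs_needed &&
           refillq_free_tags(rq) >= bufs_needed;
}

A packet then needs one buffer for the head plus one per frag, and both the descriptor and the tag budgets are checked before anything is written to the ring, which is what idpf_tx_maybe_stop_splitq() does through netif_subqueue_maybe_stop().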
Fixes: 7292af042bcf ("idpf: fix a race in txq wakeup") Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/idpf/idpf_singleq_txrx.c | 4 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 47 +++++++++++++++------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 15 ++++++- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 57c0f5ab8f9e..b19b462e0bb6 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -415,11 +415,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, { struct idpf_tx_offload_params offload = { }; struct idpf_tx_buf *first; + u32 count, buf_count = 1; int csum, tso, needed; - unsigned int count; __be16 protocol; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 5fe329a7c944..fa5432a0566a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2191,15 +2191,22 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc, desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag); } -/* Global conditions to tell whether the txq (and related resources) - * has room to allow the use of "size" descriptors. +/** + * idpf_tx_splitq_has_room - check if enough Tx splitq resources are available + * @tx_q: the queue to be checked + * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of Tx buffers required for this packet + * + * Return: 0 if no room available, 1 otherwise */ -static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) +static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed, + u32 bufs_needed) { - if (IDPF_DESC_UNUSED(tx_q) < size || + if (IDPF_DESC_UNUSED(tx_q) < descs_needed || IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || - IDPF_TX_BUF_RSV_LOW(tx_q)) + IDPF_TX_BUF_RSV_LOW(tx_q) || + idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed) return 0; return 1; } @@ -2208,14 +2215,21 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) * idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions * @tx_q: the queue to be checked * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of buffers needed for this packet * - * Returns 0 if stop is not needed + * Return: 0 if stop is not needed */ static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q, - unsigned int descs_needed) + u32 descs_needed, + u32 bufs_needed) { + /* Since we have multiple resources to check for splitq, our + * start,stop_thrs becomes a boolean check instead of a count + * threshold. 
+ */ if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, - idpf_txq_has_room(tx_q, descs_needed), + idpf_txq_has_room(tx_q, descs_needed, + bufs_needed), 1, 1)) return 0; @@ -2257,14 +2271,16 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, } /** - * idpf_tx_desc_count_required - calculate number of Tx descriptors needed + * idpf_tx_res_count_required - get number of Tx resources needed for this pkt * @txq: queue to send buffer on * @skb: send buffer + * @bufs_needed: (output) number of buffers needed for this skb. * - * Returns number of data descriptors needed for this skb. + * Return: number of data descriptors and buffers needed for this skb. */ -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb) +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, + u32 *bufs_needed) { const struct skb_shared_info *shinfo; unsigned int count = 0, i; @@ -2275,6 +2291,7 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, return count; shinfo = skb_shinfo(skb); + *bufs_needed += shinfo->nr_frags; for (i = 0; i < shinfo->nr_frags; i++) { unsigned int size; @@ -2892,11 +2909,11 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, }; union idpf_flex_tx_ctx_desc *ctx_desc; struct idpf_tx_buf *first; - unsigned int count; + u32 count, buf_count = 1; int tso, idx; u32 buf_id; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); @@ -2906,7 +2923,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, /* Check for splitq specific TX resources */ count += (IDPF_TX_DESCS_PER_CACHE_LINE + tso); - if (idpf_tx_maybe_stop_splitq(tx_q, count)) { + if (idpf_tx_maybe_stop_splitq(tx_q, count, buf_count)) { idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false); return NETDEV_TX_BUSY; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index d86246c320c8..9565e4dc3514 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -1026,6 +1026,17 @@ static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector) reg->dyn_ctl); } +/** + * idpf_tx_splitq_get_free_bufs - get number of free buf_ids in refillq + * @refillq: pointer to refillq containing buf_ids + */ +static inline u32 idpf_tx_splitq_get_free_bufs(struct idpf_sw_queue *refillq) +{ + return (refillq->next_to_use > refillq->next_to_clean ? 
+ 0 : refillq->desc_count) + + refillq->next_to_use - refillq->next_to_clean - 1; +} + int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget); void idpf_vport_init_num_qs(struct idpf_vport *vport, struct virtchnl2_create_vport *vport_msg); @@ -1053,8 +1064,8 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb); -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb); +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, u32 *buf_count); void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue); netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q); -- cgit v1.2.3 From 6c4e68480238274f84aa50d54da0d9e262df6284 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:23 -0700 Subject: idpf: remove obsolete stashing code With the new Tx buffer management scheme, there is no need for all of the stashing mechanisms, the hash table, the reserve buffer stack, etc. Remove all of that. Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 314 ++-------------------------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 47 +---- 2 files changed, 22 insertions(+), 339 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index fa5432a0566a..eaad52a83b04 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -8,48 +8,12 @@ #include "idpf_ptp.h" #include "idpf_virtchnl.h" -struct idpf_tx_stash { - struct hlist_node hlist; - struct libeth_sqe buf; -}; - #define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv) -#define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) LIBETH_SQE_CHECK_PRIV(u32); static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); -/** - * idpf_buf_lifo_push - push a buffer pointer onto stack - * @stack: pointer to stack struct - * @buf: pointer to buf to push - * - * Returns 0 on success, negative on failure - **/ -static int idpf_buf_lifo_push(struct idpf_buf_lifo *stack, - struct idpf_tx_stash *buf) -{ - if (unlikely(stack->top == stack->size)) - return -ENOSPC; - - stack->bufs[stack->top++] = buf; - - return 0; -} - -/** - * idpf_buf_lifo_pop - pop a buffer pointer from stack - * @stack: pointer to stack struct - **/ -static struct idpf_tx_stash *idpf_buf_lifo_pop(struct idpf_buf_lifo *stack) -{ - if (unlikely(!stack->top)) - return NULL; - - return stack->bufs[--stack->top]; -} - /** * idpf_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure @@ -78,14 +42,11 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue) static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { struct libeth_sq_napi_stats ss = { }; - struct idpf_buf_lifo *buf_stack; - struct idpf_tx_stash *stash; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - struct hlist_node *tmp; - u32 i, tag; + u32 i; /* Buffers already cleared, nothing to do */ if (!txq->tx_buf) @@ -97,33 +58,6 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) kfree(txq->tx_buf); txq->tx_buf = NULL; - - if (!idpf_queue_has(FLOW_SCH_EN, txq)) - return; - - buf_stack = &txq->stash->buf_stack; - if 
(!buf_stack->bufs) - return; - - /* - * If a Tx timeout occurred, there are potentially still bufs in the - * hash table, free them here. - */ - hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash, - hlist) { - if (!stash) - continue; - - libeth_tx_complete(&stash->buf, &cp); - hash_del(&stash->hlist); - idpf_buf_lifo_push(buf_stack, stash); - } - - for (i = 0; i < buf_stack->size; i++) - kfree(buf_stack->bufs[i]); - - kfree(buf_stack->bufs); - buf_stack->bufs = NULL; } /** @@ -199,9 +133,6 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport) */ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) { - struct idpf_buf_lifo *buf_stack; - int i; - /* Allocate book keeping buffers only. Buffers to be supplied to HW * are allocated by kernel network stack and received as part of skb */ @@ -214,29 +145,6 @@ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) if (!tx_q->tx_buf) return -ENOMEM; - if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) - return 0; - - buf_stack = &tx_q->stash->buf_stack; - - /* Initialize tx buf stack for out-of-order completions if - * flow scheduling offload is enabled - */ - buf_stack->bufs = kcalloc(tx_q->desc_count, sizeof(*buf_stack->bufs), - GFP_KERNEL); - if (!buf_stack->bufs) - return -ENOMEM; - - buf_stack->size = tx_q->desc_count; - buf_stack->top = tx_q->desc_count; - - for (i = 0; i < tx_q->desc_count; i++) { - buf_stack->bufs[i] = kzalloc(sizeof(*buf_stack->bufs[i]), - GFP_KERNEL); - if (!buf_stack->bufs[i]) - return -ENOMEM; - } - return 0; } @@ -350,8 +258,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) for (i = 0; i < vport->num_txq_grp; i++) { for (j = 0; j < vport->txq_grps[i].num_txq; j++) { struct idpf_tx_queue *txq = vport->txq_grps[i].txqs[j]; - u8 gen_bits = 0; - u16 bufidx_mask; err = idpf_tx_desc_alloc(vport, txq); if (err) { @@ -360,34 +266,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) i); goto err_out; } - - if (!idpf_is_queue_model_split(vport->txq_model)) - continue; - - txq->compl_tag_cur_gen = 0; - - /* Determine the number of bits in the bufid - * mask and add one to get the start of the - * generation bits - */ - bufidx_mask = txq->desc_count - 1; - while (bufidx_mask >> 1) { - txq->compl_tag_gen_s++; - bufidx_mask = bufidx_mask >> 1; - } - txq->compl_tag_gen_s++; - - gen_bits = IDPF_TX_SPLITQ_COMPL_TAG_WIDTH - - txq->compl_tag_gen_s; - txq->compl_tag_gen_max = GETMAXVAL(gen_bits); - - /* Set bufid mask based on location of first - * gen bit; it cannot simply be the descriptor - * ring size-1 since we can have size values - * where not all of those bits are set. 
- */ - txq->compl_tag_bufid_m = - GETMAXVAL(txq->compl_tag_gen_s); } if (!idpf_is_queue_model_split(vport->txq_model)) @@ -1042,9 +920,6 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) kfree(txq_grp->complq); txq_grp->complq = NULL; - - if (flow_sch_en) - kfree(txq_grp->stashes); } kfree(vport->txq_grps); vport->txq_grps = NULL; @@ -1405,7 +1280,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) for (i = 0; i < vport->num_txq_grp; i++) { struct idpf_txq_group *tx_qgrp = &vport->txq_grps[i]; struct idpf_adapter *adapter = vport->adapter; - struct idpf_txq_stash *stashes; int j; tx_qgrp->vport = vport; @@ -1418,15 +1292,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) goto err_alloc; } - if (split && flow_sch_en) { - stashes = kcalloc(num_txq, sizeof(*stashes), - GFP_KERNEL); - if (!stashes) - goto err_alloc; - - tx_qgrp->stashes = stashes; - } - for (j = 0; j < tx_qgrp->num_txq; j++) { struct idpf_tx_queue *q = tx_qgrp->txqs[j]; @@ -1446,11 +1311,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) if (!flow_sch_en) continue; - if (split) { - q->stash = &stashes[j]; - hash_init(q->stash->sched_buf_hash); - } - idpf_queue_set(FLOW_SCH_EN, q); q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL); @@ -1742,87 +1602,6 @@ static void idpf_tx_read_tstamp(struct idpf_tx_queue *txq, struct sk_buff *skb) spin_unlock_bh(&tx_tstamp_caps->status_lock); } -/** - * idpf_tx_clean_stashed_bufs - clean bufs that were stored for - * out of order completions - * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) - * @cleaned: pointer to stats struct to track cleaned packets/bytes - * @budget: Used to determine if we are in netpoll - */ -static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, - u16 compl_tag, - struct libeth_sq_napi_stats *cleaned, - int budget) -{ - struct idpf_tx_stash *stash; - struct hlist_node *tmp_buf; - struct libeth_cq_pp cp = { - .dev = txq->dev, - .ss = cleaned, - .napi = budget, - }; - - /* Buffer completion */ - hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, - hlist, compl_tag) { - if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag)) - continue; - - hash_del(&stash->hlist); - - if (stash->buf.type == LIBETH_SQE_SKB && - (skb_shinfo(stash->buf.skb)->tx_flags & SKBTX_IN_PROGRESS)) - idpf_tx_read_tstamp(txq, stash->buf.skb); - - libeth_tx_complete(&stash->buf, &cp); - - /* Push shadow buf back onto stack */ - idpf_buf_lifo_push(&txq->stash->buf_stack, stash); - } -} - -/** - * idpf_stash_flow_sch_buffers - store buffer parameters info to be freed at a - * later time (only relevant for flow scheduling mode) - * @txq: Tx queue to clean - * @tx_buf: buffer to store - */ -static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, - struct idpf_tx_buf *tx_buf) -{ - struct idpf_tx_stash *stash; - - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) - return 0; - - stash = idpf_buf_lifo_pop(&txq->stash->buf_stack); - if (unlikely(!stash)) { - net_err_ratelimited("%s: No out-of-order TX buffers left!\n", - netdev_name(txq->netdev)); - - return -ENOMEM; - } - - /* Store buffer params in shadow buffer */ - stash->buf.skb = tx_buf->skb; - stash->buf.bytes = tx_buf->bytes; - stash->buf.packets = tx_buf->packets; - stash->buf.type = tx_buf->type; - stash->buf.nr_frags = tx_buf->nr_frags; - dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); - dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, 
len)); - idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf); - - /* Add buffer to buf_hash table to be freed later */ - hash_add(txq->stash->sched_buf_hash, &stash->hlist, - idpf_tx_buf_compl_tag(&stash->buf)); - - tx_buf->type = LIBETH_SQE_EMPTY; - - return 0; -} - #define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \ do { \ if (unlikely(++(ntc) == (txq)->desc_count)) { \ @@ -1850,14 +1629,8 @@ do { \ * Separate packet completion events will be reported on the completion queue, * and the buffers will be cleaned separately. The stats are not updated from * this function when using flow-based scheduling. - * - * Furthermore, in flow scheduling mode, check to make sure there are enough - * reserve buffers to stash the packet. If there are not, return early, which - * will leave next_to_clean pointing to the packet that failed to be stashed. - * - * Return: false in the scenario above, true otherwise. */ -static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, +static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, struct libeth_sq_napi_stats *cleaned, bool descs_only) @@ -1871,12 +1644,11 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, .napi = napi_budget, }; struct idpf_tx_buf *tx_buf; - bool clean_complete = true; if (descs_only) { /* Bump ring index to mark as cleaned. */ tx_q->next_to_clean = end; - return true; + return; } tx_desc = &tx_q->flex_tx[ntc]; @@ -1897,53 +1669,24 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, break; eop_idx = tx_buf->rs_idx; + libeth_tx_complete(tx_buf, &cp); - if (descs_only) { - if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) { - clean_complete = false; - goto tx_splitq_clean_out; - } - - idpf_stash_flow_sch_buffers(tx_q, tx_buf); + /* unmap remaining buffers */ + while (ntc != eop_idx) { + idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, + tx_desc, tx_buf); - while (ntc != eop_idx) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); - idpf_stash_flow_sch_buffers(tx_q, tx_buf); - } - } else { + /* unmap any remaining paged data */ libeth_tx_complete(tx_buf, &cp); - - /* unmap remaining buffers */ - while (ntc != eop_idx) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); - - /* unmap any remaining paged data */ - libeth_tx_complete(tx_buf, &cp); - } } fetch_next_txq_desc: idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); } -tx_splitq_clean_out: tx_q->next_to_clean = ntc; - - return clean_complete; } -#define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \ -do { \ - (buf)++; \ - (ntc)++; \ - if (unlikely((ntc) == (txq)->desc_count)) { \ - buf = (txq)->tx_buf; \ - ntc = 0; \ - } \ -} while (0) - /** * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers * @txq: queue to clean @@ -1954,7 +1697,7 @@ do { \ * Clean all buffers associated with the packet starting at buf_id. Returns the * byte/segment count for the cleaned packet. 
*/ -static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, +static void idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, struct libeth_sq_napi_stats *cleaned, int budget) { @@ -1981,8 +1724,6 @@ static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, libeth_tx_complete(tx_buf, &cp); idpf_post_buf_refill(txq->refillq, buf_id); } - - return true; } /** @@ -2001,22 +1742,17 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, struct libeth_sq_napi_stats *cleaned, int budget) { - u16 compl_tag; + /* RS completion contains queue head for queue based scheduling or + * completion tag for flow based scheduling. + */ + u16 rs_compl_val = le16_to_cpu(desc->q_head_compl_tag.q_head); if (!idpf_queue_has(FLOW_SCH_EN, txq)) { - u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head); - - idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + idpf_tx_splitq_clean(txq, rs_compl_val, budget, cleaned, false); return; } - compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); - - /* If we didn't clean anything on the ring, this packet must be - * in the hash table. Go clean it there. - */ - if (!idpf_tx_clean_bufs(txq, compl_tag, cleaned, budget)) - idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget); + idpf_tx_clean_bufs(txq, rs_compl_val, cleaned, budget); } /** @@ -2133,8 +1869,7 @@ fetch_next_desc: /* Update BQL */ nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - dont_wake = !complq_ok || IDPF_TX_BUF_RSV_LOW(tx_q) || - np->state != __IDPF_VPORT_UP || + dont_wake = !complq_ok || np->state != __IDPF_VPORT_UP || !netif_carrier_ok(tx_q->netdev); /* Check if the TXQ needs to and can be restarted */ __netif_txq_completed_wake(nq, tx_q->cleaned_pkts, tx_q->cleaned_bytes, @@ -2205,7 +1940,6 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed, if (IDPF_DESC_UNUSED(tx_q) < descs_needed || IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || - IDPF_TX_BUF_RSV_LOW(tx_q) || idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed) return 0; return 1; @@ -2329,10 +2063,8 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) { ntu++; - if (ntu == txq->desc_count) { + if (ntu == txq->desc_count) ntu = 0; - txq->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(txq); - } return ntu; } @@ -2514,8 +2246,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (unlikely(++i == tx_q->desc_count)) { tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = - IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { tx_desc++; } @@ -2546,7 +2276,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (unlikely(++i == tx_q->desc_count)) { tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { tx_desc++; } @@ -2980,10 +2709,9 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; - /* Set the RE bit to catch any packets that may have not been - * stashed during RS completion cleaning. MIN_GAP is set to - * MIN_RING size to ensure it will be set at least once each - * time around the ring. + /* Set the RE bit to periodically "clean" the descriptor ring. + * MIN_GAP is set to MIN_RING size to ensure it will be set at + * least once each time around the ring. 
*/ if (idpf_tx_splitq_need_re(tx_q)) { tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 9565e4dc3514..52753dff381c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -118,10 +118,6 @@ do { \ ((((txq)->next_to_clean > (txq)->next_to_use) ? 0 : (txq)->desc_count) + \ (txq)->next_to_clean - (txq)->next_to_use - 1) -#define IDPF_TX_BUF_RSV_UNUSED(txq) ((txq)->stash->buf_stack.top) -#define IDPF_TX_BUF_RSV_LOW(txq) (IDPF_TX_BUF_RSV_UNUSED(txq) < \ - (txq)->desc_count >> 2) - #define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 1) /* Determine the absolute number of completions pending, i.e. the number of * completions that are expected to arrive on the TX completion queue. @@ -131,12 +127,6 @@ do { \ 0 : U32_MAX) + \ (txq)->num_completions_pending - (txq)->complq->num_completions) -#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 -/* Adjust the generation for the completion tag and wrap if necessary */ -#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \ - ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ - 0 : (txq)->compl_tag_cur_gen) - #define IDPF_TXBUF_NULL U32_MAX #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) @@ -154,18 +144,6 @@ union idpf_tx_flex_desc { #define idpf_tx_buf libeth_sqe -/** - * struct idpf_buf_lifo - LIFO for managing OOO completions - * @top: Used to know how many buffers are left - * @size: Total size of LIFO - * @bufs: Backing array - */ -struct idpf_buf_lifo { - u16 top; - u16 size; - struct idpf_tx_stash **bufs; -}; - /** * struct idpf_tx_offload_params - Offload parameters for a given packet * @tx_flags: Feature flags enabled for this packet @@ -476,17 +454,6 @@ struct idpf_tx_queue_stats { #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) #define IDPF_DIM_DEFAULT_PROFILE_IX 1 -/** - * struct idpf_txq_stash - Tx buffer stash for Flow-based scheduling mode - * @buf_stack: Stack of empty buffers to store buffer info for out of order - * buffer completions. See struct idpf_buf_lifo - * @sched_buf_hash: Hash table to store buffers - */ -struct idpf_txq_stash { - struct idpf_buf_lifo buf_stack; - DECLARE_HASHTABLE(sched_buf_hash, 12); -} ____cacheline_aligned; - /** * struct idpf_rx_queue - software structure representing a receive queue * @rx: universal receive descriptor array @@ -631,11 +598,7 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64, * only once at the end of the cleaning routine. 
* @clean_budget: singleq only, queue cleaning budget * @cleaned_pkts: Number of packets cleaned for the above said case - * @stash: Tx buffer stash for Flow-based scheduling mode * @refillq: Pointer to refill queue - * @compl_tag_bufid_m: Completion tag buffer id mask - * @compl_tag_cur_gen: Used to keep track of current completion tag generation - * @compl_tag_gen_max: To determine when compl_tag_cur_gen should be reset * @cached_tstamp_caps: Tx timestamp capabilities negotiated with the CP * @tstamp_task: Work that handles Tx timestamp read * @stats_sync: See struct u64_stats_sync @@ -666,7 +629,6 @@ struct idpf_tx_queue { u16 desc_count; u16 tx_min_pkt_len; - u16 compl_tag_gen_s; struct net_device *netdev; __cacheline_group_end_aligned(read_mostly); @@ -683,13 +645,8 @@ struct idpf_tx_queue { }; u16 cleaned_pkts; - struct idpf_txq_stash *stash; struct idpf_sw_queue *refillq; - u16 compl_tag_bufid_m; - u16 compl_tag_cur_gen; - u16 compl_tag_gen_max; - struct idpf_ptp_vport_tx_tstamp_caps *cached_tstamp_caps; struct work_struct *tstamp_task; @@ -707,7 +664,7 @@ struct idpf_tx_queue { __cacheline_group_end_aligned(cold); }; libeth_cacheline_set_assert(struct idpf_tx_queue, 64, - 120 + sizeof(struct u64_stats_sync), + 104 + sizeof(struct u64_stats_sync), 32); /** @@ -918,7 +875,6 @@ struct idpf_rxq_group { * @vport: Vport back pointer * @num_txq: Number of TX queues associated * @txqs: Array of TX queue pointers - * @stashes: array of OOO stashes for the queues * @complq: Associated completion queue pointer, split queue only * @num_completions_pending: Total number of completions pending for the * completion queue, acculumated for all TX queues @@ -933,7 +889,6 @@ struct idpf_txq_group { u16 num_txq; struct idpf_tx_queue *txqs[IDPF_LARGE_MAX_Q]; - struct idpf_txq_stash *stashes; struct idpf_compl_queue *complq; -- cgit v1.2.3 From b08a784a5d1495c42ff9b0c70887d49211cddfe0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 18 Aug 2025 19:03:54 +0100 Subject: net: Introduce skb_copy_datagram_from_iter_full() In a similar manner to copy_from_iter()/copy_from_iter_full(), introduce skb_copy_datagram_from_iter_full() which reverts the iterator to its initial state when returning an error. A subsequent fix for a vsock regression will make use of this new function. Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Will Deacon Acked-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi Link: https://patch.msgid.link/20250818180355.29275-2-will@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ net/core/datagram.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 14b923ddb6df..fa633657e4c0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4172,6 +4172,8 @@ int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/net/core/datagram.c b/net/core/datagram.c index 94cc4705e91d..f474b9b120f9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -618,6 +618,20 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) +{ + struct iov_iter_state state; + int ret; + + iov_iter_save_state(from, &state); + ret = skb_copy_datagram_from_iter(skb, offset, from, len); + if (ret) + iov_iter_restore(from, &state); + return ret; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iter_full); + int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { -- cgit v1.2.3 From 7fb1291257ea1e27dbc3f34c6a37b4d640aafdd7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 18 Aug 2025 19:03:55 +0100 Subject: vsock/virtio: Fix message iterator handling on transmit path Commit 6693731487a8 ("vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers") converted the virtio vsock transmit path to utilise nonlinear SKBs when handling large buffers. As part of this change, virtio_transport_fill_skb() was updated to call skb_copy_datagram_from_iter() instead of memcpy_from_msg() as the latter expects a single destination buffer and cannot handle nonlinear SKBs correctly. Unfortunately, during this conversion, I overlooked the error case when the copying function returns -EFAULT due to a fault on the input buffer in userspace. In this case, memcpy_from_msg() reverts the iterator to its initial state thanks to copy_from_iter_full() whereas skb_copy_datagram_from_iter() leaves the iterator partially advanced. This results in a WARN_ONCE() from the vsock code, which expects the iterator to stay in sync with the number of bytes transmitted so that virtio_transport_send_pkt_info() can return -EFAULT when it is called again: ------------[ cut here ]------------ 'send_pkt()' returns 0, but 65536 expected WARNING: CPU: 0 PID: 5503 at net/vmw_vsock/virtio_transport_common.c:428 virtio_transport_send_pkt_info+0xd11/0xf00 net/vmw_vsock/virtio_transport_common.c:426 Modules linked in: CPU: 0 UID: 0 PID: 5503 Comm: syz.0.17 Not tainted 6.16.0-syzkaller-12063-g37816488247d #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call virtio_transport_fill_skb_full() to restore the previous iterator behaviour. 
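A hedged, caller-side sketch of what the new helper guarantees (the caller below is hypothetical and not taken from either patch): because skb_copy_datagram_from_iter_full() rewinds the iterator on failure, a transmit path that copies user data into a possibly nonlinear skb keeps msg_iter in sync with the bytes actually consumed and can simply surface -EFAULT.

```c
/* Hypothetical caller, shown for illustration only */
static int example_fill_skb(struct sk_buff *skb, struct msghdr *msg, int len)
{
	int err;

	/* On error the iterator is restored, mirroring copy_from_iter_full() */
	err = skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len);
	if (err)
		return err;	/* msg_iter is unchanged; safe to report -EFAULT */

	return 0;
}
```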
Cc: Jason Wang Cc: Stefano Garzarella Fixes: 6693731487a8 ("vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers") Reported-by: syzbot+b4d960daf7a3c7c2b7b1@syzkaller.appspotmail.com Signed-off-by: Will Deacon Acked-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Link: https://patch.msgid.link/20250818180355.29275-3-will@kernel.org Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index fe92e5fa95b4..dcc8a1d5851e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -105,12 +105,14 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, size_t len, bool zcopy) { + struct msghdr *msg = info->msg; + if (zcopy) - return __zerocopy_sg_from_iter(info->msg, NULL, skb, - &info->msg->msg_iter, len, NULL); + return __zerocopy_sg_from_iter(msg, NULL, skb, + &msg->msg_iter, len, NULL); virtio_vsock_skb_put(skb, len); - return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len); + return skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len); } static void virtio_transport_init_hdr(struct sk_buff *skb, -- cgit v1.2.3 From 01b9128c5db1b470575d07b05b67ffa3cb02ebf1 Mon Sep 17 00:00:00 2001 From: luoguangfei <15388634752@163.com> Date: Tue, 19 Aug 2025 07:25:27 +0800 Subject: net: macb: fix unregister_netdev call order in macb_remove() When removing a macb device, the driver calls phy_exit() before unregister_netdev(). This leads to a WARN from kernfs: ------------[ cut here ]------------ kernfs: can not remove 'attached_dev', no directory WARNING: CPU: 1 PID: 27146 at fs/kernfs/dir.c:1683 Call trace: kernfs_remove_by_name_ns+0xd8/0xf0 sysfs_remove_link+0x24/0x58 phy_detach+0x5c/0x168 phy_disconnect+0x4c/0x70 phylink_disconnect_phy+0x6c/0xc0 [phylink] macb_close+0x6c/0x170 [macb] ... macb_remove+0x60/0x168 [macb] platform_remove+0x5c/0x80 ... The warning happens because the PHY is being exited while the netdev is still registered. The correct order is to unregister the netdev before shutting down the PHY and cleaning up the MDIO bus. Fix this by moving unregister_netdev() ahead of phy_exit() in macb_remove(). 
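A simplified sketch of the resulting teardown order in macb_remove() (only the reordered calls are shown; error paths and the remaining cleanup are omitted): unregistering the netdev first lets phylink detach the PHY while its sysfs links still exist, after which the PHY and MDIO bus can be torn down safely.

```c
/* Simplified view of the corrected ordering, illustration only */
unregister_netdev(dev);			/* closes the netdev and detaches the PHY */
phy_exit(bp->sgmii_phy);		/* PHY no longer referenced by the netdev */
mdiobus_unregister(bp->mii_bus);
mdiobus_free(bp->mii_bus);
```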
Fixes: 8b73fa3ae02b ("net: macb: Added ZynqMP-specific initialization") Signed-off-by: luoguangfei <15388634752@163.com> Link: https://patch.msgid.link/20250818232527.1316-1-15388634752@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 9693f0289435..b29c3beae0b2 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5399,11 +5399,11 @@ static void macb_remove(struct platform_device *pdev) if (dev) { bp = netdev_priv(dev); + unregister_netdev(dev); phy_exit(bp->sgmii_phy); mdiobus_unregister(bp->mii_bus); mdiobus_free(bp->mii_bus); - unregister_netdev(dev); cancel_work_sync(&bp->hresp_err_bh_work); pm_runtime_disable(&pdev->dev); pm_runtime_dont_use_autosuspend(&pdev->dev); -- cgit v1.2.3 From 5d7eba62e5eb68347de59b31b347b24f304cf21c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 22 Aug 2025 13:40:18 -0400 Subject: Bluetooth: hci_conn: Make unacked packet handling more robust This attempts to make unacked packet handling more robust by detecting if there are no connections left then restore all buffers of the respective pool. Fixes: 5638d9ea9c01 ("Bluetooth: hci_conn: Fix not restoring ISO buffer count on disconnect") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 58 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7a879290dd28..e524bb59bff2 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -149,8 +149,6 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_chan_list_flush(conn); - hci_conn_hash_del(hdev, conn); - if (HCI_CONN_HANDLE_UNSET(conn->handle)) ida_free(&hdev->unset_handle_ida, conn->handle); @@ -1152,28 +1150,54 @@ void hci_conn_del(struct hci_conn *conn) disable_delayed_work_sync(&conn->auto_accept_work); disable_delayed_work_sync(&conn->idle_work); - if (conn->type == ACL_LINK) { - /* Unacked frames */ - hdev->acl_cnt += conn->sent; - } else if (conn->type == LE_LINK) { - cancel_delayed_work(&conn->le_conn_timeout); + /* Remove the connection from the list so unacked logic can detect when + * a certain pool is not being utilized. + */ + hci_conn_hash_del(hdev, conn); - if (hdev->le_pkts) - hdev->le_cnt += conn->sent; + /* Handle unacked frames: + * + * - In case there are no connection, or if restoring the buffers + * considered in transist would overflow, restore all buffers to the + * pool. 
+ * - Otherwise restore just the buffers considered in transit for the + * hci_conn + */ + switch (conn->type) { + case ACL_LINK: + if (!hci_conn_num(hdev, ACL_LINK) || + hdev->acl_cnt + conn->sent > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; - } else { - /* Unacked ISO frames */ - if (conn->type == CIS_LINK || - conn->type == BIS_LINK || - conn->type == PA_LINK) { - if (hdev->iso_pkts) - hdev->iso_cnt += conn->sent; - else if (hdev->le_pkts) + break; + case LE_LINK: + cancel_delayed_work(&conn->le_conn_timeout); + + if (hdev->le_pkts) { + if (!hci_conn_num(hdev, LE_LINK) || + hdev->le_cnt + conn->sent > hdev->le_pkts) + hdev->le_cnt = hdev->le_pkts; + else hdev->le_cnt += conn->sent; + } else { + if ((!hci_conn_num(hdev, LE_LINK) && + !hci_conn_num(hdev, ACL_LINK)) || + hdev->acl_cnt + conn->sent > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; } + break; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + if (!hci_iso_count(hdev) || + hdev->iso_cnt + conn->sent > hdev->iso_pkts) + hdev->iso_cnt = hdev->iso_pkts; + else + hdev->iso_cnt += conn->sent; + break; } skb_queue_purge(&conn->data_q); -- cgit v1.2.3 From 2f050a5392b7a0928bf836d9891df4851463512c Mon Sep 17 00:00:00 2001 From: Ludovico de Nittis Date: Tue, 12 Aug 2025 17:55:26 +0200 Subject: Bluetooth: hci_event: Treat UNKNOWN_CONN_ID on disconnect as success When the host sends an HCI_OP_DISCONNECT command, the controller may respond with the status HCI_ERROR_UNKNOWN_CONN_ID (0x02). E.g. this can happen on resume from suspend, if the link was terminated by the remote device before the event mask was correctly set. This is a btmon snippet that shows the issue: ``` > ACL Data RX: Handle 3 flags 0x02 dlen 12 L2CAP: Disconnection Request (0x06) ident 5 len 4 Destination CID: 65 Source CID: 72 < ACL Data TX: Handle 3 flags 0x00 dlen 12 L2CAP: Disconnection Response (0x07) ident 5 len 4 Destination CID: 65 Source CID: 72 > ACL Data RX: Handle 3 flags 0x02 dlen 12 L2CAP: Disconnection Request (0x06) ident 6 len 4 Destination CID: 64 Source CID: 71 < ACL Data TX: Handle 3 flags 0x00 dlen 12 L2CAP: Disconnection Response (0x07) ident 6 len 4 Destination CID: 64 Source CID: 71 < HCI Command: Set Event Mask (0x03|0x0001) plen 8 Mask: 0x3dbff807fffbffff Inquiry Complete Inquiry Result Connection Complete Connection Request Disconnection Complete Authentication Complete [...] < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 3 Address: 78:20:A5:4A:DF:28 (Nintendo Co.,Ltd) Reason: Remote User Terminated Connection (0x13) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 1 Status: Unknown Connection Identifier (0x02) ``` Currently, the hci_cs_disconnect function treats any non-zero status as a command failure. This can be misleading because the connection is indeed being terminated and the controller is confirming that is has no knowledge of that connection handle. Meaning that the initial request of disconnecting a device should be treated as done. With this change we allow the function to proceed, following the success path, which correctly calls `mgmt_device_disconnected` and ensures a consistent state. 
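A condensed view of the changed check (illustrative, not the full handler): only statuses other than "Unknown Connection Identifier" are still reported as a failed disconnect, so 0x02 falls through to the normal cleanup path.

```c
/* Condensed sketch of the updated status handling */
if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) {
	mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
			       conn->dst_type, status);
	goto done;
}
/* status is 0x00 or 0x02: treat the link as gone and notify mgmt */
```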
Link: https://github.com/bluez/bluez/issues/1226 Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Ludovico de Nittis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fe7cdd67ad2a..6c67dfa139e2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2703,7 +2703,7 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) if (!conn) goto unlock; - if (status) { + if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) { mgmt_disconnect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); -- cgit v1.2.3 From b7fafbc499b5ee164018eb0eefe9027f5a6aaad2 Mon Sep 17 00:00:00 2001 From: Ludovico de Nittis Date: Tue, 12 Aug 2025 17:55:27 +0200 Subject: Bluetooth: hci_event: Mark connection as closed during suspend disconnect When suspending, the disconnect command for an active Bluetooth connection could be issued, but the corresponding `HCI_EV_DISCONN_COMPLETE` event might not be received before the system completes the suspend process. This can lead to an inconsistent state. On resume, the controller may auto-accept reconnections from the same device (due to suspend event filters), but these new connections are rejected by the kernel which still has connection objects from before suspend. Resulting in errors like: ``` kernel: Bluetooth: hci0: ACL packet for unknown connection handle 1 kernel: Bluetooth: hci0: Ignoring HCI_Connection_Complete for existing connection ``` This is a btmon snippet that shows the issue: ``` < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 1 Address: 78:20:A5:4A:DF:28 (Nintendo Co.,Ltd) Reason: Remote User Terminated Connection (0x13) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 2 Status: Success (0x00) [...] // Host suspends with the event filter set for the device // On resume, the device tries to reconnect with a new handle > HCI Event: Connect Complete (0x03) plen 11 Status: Success (0x00) Handle: 2 Address: 78:20:A5:4A:DF:28 (Nintendo Co.,Ltd) // Kernel ignores this event because there is an existing connection with // handle 1 ``` By explicitly setting the connection state to BT_CLOSED we can ensure a consistent state, even if we don't receive the disconnect complete event in time. 
Link: https://github.com/bluez/bluez/issues/1226 Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Ludovico de Nittis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6c67dfa139e2..ce0ff06f2f73 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2718,6 +2718,12 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) goto done; } + /* During suspend, mark connection as closed immediately + * since we might not receive HCI_EV_DISCONN_COMPLETE + */ + if (hdev->suspended) + conn->state = BT_CLOSED; + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); if (conn->type == ACL_LINK) { -- cgit v1.2.3 From 15bf2c6391bafb14a3020d06ec0761bce0803463 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 20 Aug 2025 17:04:00 -0400 Subject: Bluetooth: hci_event: Detect if HCI_EV_NUM_COMP_PKTS is unbalanced This attempts to detect if HCI_EV_NUM_COMP_PKTS contain an unbalanced (more than currently considered outstanding) number of packets otherwise it could cause the hcon->sent to underflow and loop around breaking the tracking of the outstanding packets pending acknowledgment. Fixes: f42809185896 ("Bluetooth: Simplify num_comp_pkts_evt function") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ce0ff06f2f73..904bcff4f4ca 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4404,7 +4404,17 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, if (!conn) continue; - conn->sent -= count; + /* Check if there is really enough packets outstanding before + * attempting to decrease the sent counter otherwise it could + * underflow.. + */ + if (conn->sent >= count) { + conn->sent -= count; + } else { + bt_dev_warn(hdev, "hcon %p sent %u < count %u", + conn, conn->sent, count); + conn->sent = 0; + } for (i = 0; i < count; ++i) hci_conn_tx_dequeue(conn); -- cgit v1.2.3 From 55b9551fcdf6a2fe7f3422918d5697b56794da72 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 20 Aug 2025 10:16:17 +0800 Subject: Bluetooth: hci_event: Disconnect device when BIG sync is lost When a BIG sync is lost, the device should be set to "disconnected". This ensures symmetry with the ISO path setup, where the device is marked as "connected" once the path is established. Without this change, the device state remains inconsistent and may lead to a memory leak. 
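The two halves of the fix, condensed for illustration (both fragments are taken from the diff that follows): the BIG Sync Lost handler now notifies mgmt for each BIS before deleting it, and mgmt_device_disconnected() accepts BIS_LINK alongside ACL and LE links.

```c
/* In hci_le_big_sync_lost_evt(): report each BIS as disconnected */
mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags);
mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type,
			 ev->reason, mgmt_conn);

/* In mgmt_device_disconnected(): BIS links are no longer filtered out */
if (link_type != ACL_LINK && link_type != LE_LINK && link_type != BIS_LINK)
	return;
```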
Fixes: b2a5f2e1c127 ("Bluetooth: hci_event: Add support for handling LE BIG Sync Lost event") Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 5 +++++ net/bluetooth/mgmt.c | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 904bcff4f4ca..7a2174851857 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7024,6 +7024,7 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, { struct hci_evt_le_big_sync_lost *ev = data; struct hci_conn *bis, *conn; + bool mgmt_conn; bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); @@ -7042,6 +7043,10 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, BT_CONNECTED, HCI_ROLE_SLAVE))) { + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags); + mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type, + ev->reason, mgmt_conn); + clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); hci_disconn_cfm(bis, ev->reason); hci_conn_del(bis); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3166f5fb876b..90e37ff2c85d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -9705,7 +9705,9 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, if (!mgmt_connected) return; - if (link_type != ACL_LINK && link_type != LE_LINK) + if (link_type != ACL_LINK && + link_type != LE_LINK && + link_type != BIS_LINK) return; bacpy(&ev.addr.bdaddr, bdaddr); -- cgit v1.2.3 From 6bbd0d3f0c23fc53c17409dd7476f38ae0ff0cd9 Mon Sep 17 00:00:00 2001 From: Pavel Shpakovskiy Date: Fri, 22 Aug 2025 12:20:55 +0300 Subject: Bluetooth: hci_sync: fix set_local_name race condition Function set_name_sync() uses hdev->dev_name field to send HCI_OP_WRITE_LOCAL_NAME command, but copying from data to hdev->dev_name is called after mgmt cmd was queued, so it is possible that function set_name_sync() will read old name value. This change adds name as a parameter for function hci_update_name_sync() to avoid race condition. 
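Condensed sketch of the race being closed (taken from the diff that follows): the queued callback now reads the name from the mgmt command's own parameters instead of hdev->dev_name, which the caller only updates after the command has been queued.

```c
/* Fragment of set_name_sync() after the fix; the rest of the function
 * is unchanged and omitted here.
 */
struct mgmt_pending_cmd *cmd = data;
struct mgmt_cp_set_local_name *cp = cmd->param;

if (lmp_bredr_capable(hdev)) {
	hci_update_name_sync(hdev, cp->name);	/* was hdev->dev_name */
	hci_update_eir_sync(hdev);
}
```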
Fixes: 6f6ff38a1e14 ("Bluetooth: hci_sync: Convert MGMT_OP_SET_LOCAL_NAME") Signed-off-by: Pavel Shpakovskiy Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 2 +- net/bluetooth/hci_sync.c | 6 +++--- net/bluetooth/mgmt.c | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 5224f57f6af2..e352a4e0ef8d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -93,7 +93,7 @@ int hci_update_class_sync(struct hci_dev *hdev); int hci_update_eir_sync(struct hci_dev *hdev); int hci_update_class_sync(struct hci_dev *hdev); -int hci_update_name_sync(struct hci_dev *hdev); +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name); int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 31d72b9683ef..b6f888d8354e 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3481,13 +3481,13 @@ int hci_update_scan_sync(struct hci_dev *hdev) return hci_write_scan_enable_sync(hdev, scan); } -int hci_update_name_sync(struct hci_dev *hdev) +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name) { struct hci_cp_write_local_name cp; memset(&cp, 0, sizeof(cp)); - memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); + memcpy(cp.name, name, sizeof(cp.name)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp, @@ -3540,7 +3540,7 @@ int hci_powered_update_sync(struct hci_dev *hdev) hci_write_fast_connectable_sync(hdev, false); hci_update_scan_sync(hdev); hci_update_class_sync(hdev); - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, hdev->dev_name); hci_update_eir_sync(hdev); } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 90e37ff2c85d..50634ef5c8b7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3892,8 +3892,11 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) static int set_name_sync(struct hci_dev *hdev, void *data) { + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_set_local_name *cp = cmd->param; + if (lmp_bredr_capable(hdev)) { - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, cp->name); hci_update_eir_sync(hdev); } -- cgit v1.2.3 From abadf0ff63be488dc502ecfc9f622929a21b7117 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 21 Aug 2025 03:03:46 +0000 Subject: page_pool: fix incorrect mp_ops error handling Minor fix to the memory provider error handling, we should be jumping to free_ptr_ring in this error case rather than returning directly. Found by code-inspection. 
Cc: skhawaja@google.com Fixes: b400f4b87430 ("page_pool: Set `dma_sync` to false for devmem memory provider") Signed-off-by: Mina Almasry Reviewed-by: Samiullah Khawaja Link: https://patch.msgid.link/20250821030349.705244-1-almasrymina@google.com Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 343a6cac21e3..ba70569bd4b0 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -287,8 +287,10 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_ops) { - if (!pool->dma_map || !pool->dma_sync) - return -EOPNOTSUPP; + if (!pool->dma_map || !pool->dma_sync) { + err = -EOPNOTSUPP; + goto free_ptr_ring; + } if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { err = -EFAULT; -- cgit v1.2.3 From a64494aafc56939564e3e9e57f99df5c27204e04 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 21 Aug 2025 11:55:28 +0530 Subject: Octeontx2-vf: Fix max packet length errors Once driver submits the packets to the hardware, each packet traverse through multiple transmit levels in the following order: SMQ -> TL4 -> TL3 -> TL2 -> TL1 The SMQ supports configurable minimum and maximum packet sizes. It enters to a hang state, if driver submits packets with out of bound lengths. To avoid the same, implement packet length validation before submitting packets to the hardware. Increment tx_dropped counter on failure. Fixes: 3184fb5ba96e ("octeontx2-vf: Virtual function driver support") Fixes: 22f858796758 ("octeontx2-pf: Add basic net_device_ops") Fixes: 3ca6c4c882a7 ("octeontx2-pf: Add packet transmission support") Signed-off-by: Hariprasad Kelam Link: https://patch.msgid.link/20250821062528.1697992-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 4 +++- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h | 1 + drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 3 +++ drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 10 ++++++++++ drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 13 ++++++++++++- drivers/net/ethernet/marvell/octeontx2/nic/rep.h | 1 + 6 files changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index f674729124e6..aff17c37ddde 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -124,7 +124,9 @@ void otx2_get_dev_stats(struct otx2_nic *pfvf) dev_stats->rx_ucast_frames; dev_stats->tx_bytes = OTX2_GET_TX_STATS(TX_OCTS); - dev_stats->tx_drops = OTX2_GET_TX_STATS(TX_DROP); + dev_stats->tx_drops = OTX2_GET_TX_STATS(TX_DROP) + + (unsigned long)atomic_long_read(&dev_stats->tx_discards); + dev_stats->tx_bcast_frames = OTX2_GET_TX_STATS(TX_BCAST); dev_stats->tx_mcast_frames = OTX2_GET_TX_STATS(TX_MCAST); dev_stats->tx_ucast_frames = OTX2_GET_TX_STATS(TX_UCAST); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index e3765b73c434..1c8a3c078a64 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -153,6 +153,7 @@ struct otx2_dev_stats { u64 tx_bcast_frames; u64 tx_mcast_frames; u64 tx_drops; + atomic_long_t tx_discards; }; /* Driver counted stats */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c 
b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index b23585c5e5c2..5027fae0aa77 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -2220,6 +2220,7 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev) { struct otx2_nic *pf = netdev_priv(netdev); int qidx = skb_get_queue_mapping(skb); + struct otx2_dev_stats *dev_stats; struct otx2_snd_queue *sq; struct netdev_queue *txq; int sq_idx; @@ -2232,6 +2233,8 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev) /* Check for minimum and maximum packet length */ if (skb->len <= ETH_HLEN || (!skb_shinfo(skb)->gso_size && skb->len > pf->tx_max_pktlen)) { + dev_stats = &pf->hw.dev_stats; + atomic_long_inc(&dev_stats->tx_discards); dev_kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 5589fccd370b..7ebb6e656884 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -417,9 +417,19 @@ static netdev_tx_t otx2vf_xmit(struct sk_buff *skb, struct net_device *netdev) { struct otx2_nic *vf = netdev_priv(netdev); int qidx = skb_get_queue_mapping(skb); + struct otx2_dev_stats *dev_stats; struct otx2_snd_queue *sq; struct netdev_queue *txq; + /* Check for minimum and maximum packet length */ + if (skb->len <= ETH_HLEN || + (!skb_shinfo(skb)->gso_size && skb->len > vf->tx_max_pktlen)) { + dev_stats = &vf->hw.dev_stats; + atomic_long_inc(&dev_stats->tx_discards); + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + sq = &vf->qset.sq[qidx]; txq = netdev_get_tx_queue(netdev, qidx); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 25af98034e2e..b476733a0234 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -371,7 +371,8 @@ static void rvu_rep_get_stats(struct work_struct *work) stats->rx_mcast_frames = rsp->rx.mcast; stats->tx_bytes = rsp->tx.octs; stats->tx_frames = rsp->tx.ucast + rsp->tx.bcast + rsp->tx.mcast; - stats->tx_drops = rsp->tx.drop; + stats->tx_drops = rsp->tx.drop + + (unsigned long)atomic_long_read(&stats->tx_discards); exit: mutex_unlock(&priv->mbox.lock); } @@ -418,6 +419,16 @@ static netdev_tx_t rvu_rep_xmit(struct sk_buff *skb, struct net_device *dev) struct otx2_nic *pf = rep->mdev; struct otx2_snd_queue *sq; struct netdev_queue *txq; + struct rep_stats *stats; + + /* Check for minimum and maximum packet length */ + if (skb->len <= ETH_HLEN || + (!skb_shinfo(skb)->gso_size && skb->len > pf->tx_max_pktlen)) { + stats = &rep->stats; + atomic_long_inc(&stats->tx_discards); + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } sq = &pf->qset.sq[rep->rep_id]; txq = netdev_get_tx_queue(dev, 0); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.h b/drivers/net/ethernet/marvell/octeontx2/nic/rep.h index 38446b3e4f13..5bc9e2c7d800 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.h @@ -27,6 +27,7 @@ struct rep_stats { u64 tx_bytes; u64 tx_frames; u64 tx_drops; + atomic_long_t tx_discards; }; struct rep_dev { -- cgit v1.2.3 From ec79003c5f9d2c7f9576fc69b8dbda80305cbe3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 21 Aug 2025 02:18:24 +0000 Subject: atm: atmtcp: Prevent arbitrary write in atmtcp_recv_control(). 
syzbot reported the splat below. [0] When atmtcp_v_open() or atmtcp_v_close() is called via connect() or close(), atmtcp_send_control() is called to send an in-kernel special message. The message has ATMTCP_HDR_MAGIC in atmtcp_control.hdr.length. Also, a pointer of struct atm_vcc is set to atmtcp_control.vcc. The notable thing is struct atmtcp_control is uAPI but has a space for an in-kernel pointer. struct atmtcp_control { struct atmtcp_hdr hdr; /* must be first */ ... atm_kptr_t vcc; /* both directions */ ... } __ATM_API_ALIGN; typedef struct { unsigned char _[8]; } __ATM_API_ALIGN atm_kptr_t; The special message is processed in atmtcp_recv_control() called from atmtcp_c_send(). atmtcp_c_send() is vcc->dev->ops->send() and called from 2 paths: 1. .ndo_start_xmit() (vcc->send() == atm_send_aal0()) 2. vcc_sendmsg() The problem is sendmsg() does not validate the message length and userspace can abuse atmtcp_recv_control() to overwrite any kptr by atmtcp_control. Let's add a new ->pre_send() hook to validate messages from sendmsg(). [0]: Oops: general protection fault, probably for non-canonical address 0xdffffc00200000ab: 0000 [#1] SMP KASAN PTI KASAN: probably user-memory-access in range [0x0000000100000558-0x000000010000055f] CPU: 0 UID: 0 PID: 5865 Comm: syz-executor331 Not tainted 6.17.0-rc1-syzkaller-00215-gbab3ce404553 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:atmtcp_recv_control drivers/atm/atmtcp.c:93 [inline] RIP: 0010:atmtcp_c_send+0x1da/0x950 drivers/atm/atmtcp.c:297 Code: 4d 8d 75 1a 4c 89 f0 48 c1 e8 03 42 0f b6 04 20 84 c0 0f 85 15 06 00 00 41 0f b7 1e 4d 8d b7 60 05 00 00 4c 89 f0 48 c1 e8 03 <42> 0f b6 04 20 84 c0 0f 85 13 06 00 00 66 41 89 1e 4d 8d 75 1c 4c RSP: 0018:ffffc90003f5f810 EFLAGS: 00010203 RAX: 00000000200000ab RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88802a510000 RSI: 00000000ffffffff RDI: ffff888030a6068c RBP: ffff88802699fb40 R08: ffff888030a606eb R09: 1ffff1100614c0dd R10: dffffc0000000000 R11: ffffffff8718fc40 R12: dffffc0000000000 R13: ffff888030a60680 R14: 000000010000055f R15: 00000000ffffffff FS: 00007f8d7e9236c0(0000) GS:ffff888125c1c000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000045ad50 CR3: 0000000075bde000 CR4: 00000000003526f0 Call Trace: vcc_sendmsg+0xa10/0xc60 net/atm/common.c:645 sock_sendmsg_nosec net/socket.c:714 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:729 ____sys_sendmsg+0x505/0x830 net/socket.c:2614 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2668 __sys_sendmsg net/socket.c:2700 [inline] __do_sys_sendmsg net/socket.c:2705 [inline] __se_sys_sendmsg net/socket.c:2703 [inline] __x64_sys_sendmsg+0x19b/0x260 net/socket.c:2703 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f8d7e96a4a9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f8d7e923198 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f8d7e9f4308 RCX: 00007f8d7e96a4a9 RDX: 0000000000000000 RSI: 0000200000000240 RDI: 0000000000000005 RBP: 00007f8d7e9f4300 R08: 65732f636f72702f R09: 65732f636f72702f R10: 65732f636f72702f R11: 0000000000000246 R12: 00007f8d7e9c10ac R13: 00007f8d7e9231a0 R14: 0000200000000200 R15: 0000200000000250 Modules linked in: 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+1741b56d54536f4ec349@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68a6767c.050a0220.3d78fd.0011.GAE@google.com/ Tested-by: syzbot+1741b56d54536f4ec349@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250821021901.2814721-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- drivers/atm/atmtcp.c | 17 ++++++++++++++--- include/linux/atmdev.h | 1 + net/atm/common.c | 15 ++++++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index eeae160c898d..fa3c76a2b49d 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -279,6 +279,19 @@ static struct atm_vcc *find_vcc(struct atm_dev *dev, short vpi, int vci) return NULL; } +static int atmtcp_c_pre_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct atmtcp_hdr *hdr; + + if (skb->len < sizeof(struct atmtcp_hdr)) + return -EINVAL; + + hdr = (struct atmtcp_hdr *)skb->data; + if (hdr->length == ATMTCP_HDR_MAGIC) + return -EINVAL; + + return 0; +} static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb) { @@ -288,9 +301,6 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb) struct sk_buff *new_skb; int result = 0; - if (skb->len < sizeof(struct atmtcp_hdr)) - goto done; - dev = vcc->dev_data; hdr = (struct atmtcp_hdr *) skb->data; if (hdr->length == ATMTCP_HDR_MAGIC) { @@ -347,6 +357,7 @@ static const struct atmdev_ops atmtcp_v_dev_ops = { static const struct atmdev_ops atmtcp_c_dev_ops = { .close = atmtcp_c_close, + .pre_send = atmtcp_c_pre_send, .send = atmtcp_c_send }; diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 45f2f278b50a..70807c679f1a 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -185,6 +185,7 @@ struct atmdev_ops { /* only send is required */ int (*compat_ioctl)(struct atm_dev *dev,unsigned int cmd, void __user *arg); #endif + int (*pre_send)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send)(struct atm_vcc *vcc,struct sk_buff *skb); int (*send_bh)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send_oam)(struct atm_vcc *vcc,void *cell,int flags); diff --git a/net/atm/common.c b/net/atm/common.c index d7f7976ea13a..881c7f259dbd 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -635,18 +635,27 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) skb->dev = NULL; /* for paths shared with net_device interfaces */ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { - atm_return_tx(vcc, skb); - kfree_skb(skb); error = -EFAULT; - goto out; + goto free_skb; } if (eff != size) memset(skb->data + size, 0, eff-size); + + if (vcc->dev->ops->pre_send) { + error = vcc->dev->ops->pre_send(vcc, skb); + if (error) + goto free_skb; + } + error = vcc->dev->ops->send(vcc, skb); error = error ? error : size; out: release_sock(sk); return error; +free_skb: + atm_return_tx(vcc, skb); + kfree_skb(skb); + goto out; } __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) -- cgit v1.2.3 From 60dfe2434eed13082f26eb7409665dfafb38fa51 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Tue, 24 Jun 2025 07:26:40 -0700 Subject: ice: fix NULL pointer dereference in ice_unplug_aux_dev() on reset Issuing a reset when the driver is loaded without RDMA support, will results in a crash as it attempts to remove RDMA's non-existent auxbus device: echo 1 > /sys/class/net//device/reset BUG: kernel NULL pointer dereference, address: 0000000000000008 ... 
RIP: 0010:ice_unplug_aux_dev+0x29/0x70 [ice] ... Call Trace: ice_prepare_for_reset+0x77/0x260 [ice] pci_dev_save_and_disable+0x2c/0x70 pci_reset_function+0x88/0x130 reset_store+0x5a/0xa0 kernfs_fop_write_iter+0x15e/0x210 vfs_write+0x273/0x520 ksys_write+0x6b/0xe0 do_syscall_64+0x79/0x3b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ice_unplug_aux_dev() checks pf->cdev_info->adev for NULL pointer, but pf->cdev_info will also be NULL, leading to the deref in the trace above. Introduce a flag to be set when the creation of the auxbus device is successful, to avoid multiple NULL pointer checks in ice_unplug_aux_dev(). Fixes: c24a65b6a27c7 ("iidc/ice/irdma: Update IDC to support multiple consumers") Signed-off-by: Emil Tantilov Reviewed-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_idc.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2098f00b3cd3..8a8a01a4bb40 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -510,6 +510,7 @@ enum ice_pf_flags { ICE_FLAG_LINK_LENIENT_MODE_ENA, ICE_FLAG_PLUG_AUX_DEV, ICE_FLAG_UNPLUG_AUX_DEV, + ICE_FLAG_AUX_DEV_CREATED, ICE_FLAG_MTU_CHANGED, ICE_FLAG_GNSS, /* GNSS successfully initialized */ ICE_FLAG_DPLL, /* SyncE/PTP dplls initialized */ diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index 6ab53e430f91..420d45c2558b 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -336,6 +336,7 @@ int ice_plug_aux_dev(struct ice_pf *pf) mutex_lock(&pf->adev_mutex); cdev->adev = adev; mutex_unlock(&pf->adev_mutex); + set_bit(ICE_FLAG_AUX_DEV_CREATED, pf->flags); return 0; } @@ -347,15 +348,16 @@ void ice_unplug_aux_dev(struct ice_pf *pf) { struct auxiliary_device *adev; + if (!test_and_clear_bit(ICE_FLAG_AUX_DEV_CREATED, pf->flags)) + return; + mutex_lock(&pf->adev_mutex); adev = pf->cdev_info->adev; pf->cdev_info->adev = NULL; mutex_unlock(&pf->adev_mutex); - if (adev) { - auxiliary_device_delete(adev); - auxiliary_device_uninit(adev); - } + auxiliary_device_delete(adev); + auxiliary_device_uninit(adev); } /** -- cgit v1.2.3 From 86aae43f21cf784c1d7f6a9af93e5116b0f232ab Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 17 Jul 2025 09:57:09 -0700 Subject: ice: don't leave device non-functional if Tx scheduler config fails The ice_cfg_tx_topo function attempts to apply Tx scheduler topology configuration based on NVM parameters, selecting either a 5 or 9 layer topology. As part of this flow, the driver acquires the "Global Configuration Lock", which is a hardware resource associated with programming the DDP package to the device. This "lock" is implemented by firmware as a way to guarantee that only one PF can program the DDP for a device. Unlike a traditional lock, once a PF has acquired this lock, no other PF will be able to acquire it again (including that PF) until a CORER of the device. Future requests to acquire the lock report that global configuration has already completed. 
The following flow is used to program the Tx topology: * Read the DDP package for scheduler configuration data * Acquire the global configuration lock * Program Tx scheduler topology according to DDP package data * Trigger a CORER which clears the global configuration lock This is followed by the flow for programming the DDP package: * Acquire the global configuration lock (again) * Download the DDP package to the device * Release the global configuration lock. However, if configuration of the Tx topology fails, (i.e. ice_get_set_tx_topo returns an error code), the driver exits ice_cfg_tx_topo() immediately, and fails to trigger CORER. While the global configuration lock is held, the firmware rejects most AdminQ commands, as it is waiting for the DDP package download (or Tx scheduler topology programming) to occur. The current driver flows assume that the global configuration lock has been reset by CORER after programming the Tx topology. Thus, the same PF attempts to acquire the global lock again, and fails. This results in the driver reporting "an unknown error occurred when loading the DDP package". It then attempts to enter safe mode, but ultimately fails to finish ice_probe() since nearly all AdminQ command report error codes, and the driver stops loading the device at some point during its initialization. The only currently known way that ice_get_set_tx_topo() can fail is with certain older DDP packages which contain invalid topology configuration, on firmware versions which strictly validate this data. The most recent releases of the DDP have resolved the invalid data. However, it is still poor practice to essentially brick the device, and prevent access to the device even through safe mode or recovery mode. It is also plausible that this command could fail for some other reason in the future. We cannot simply release the global lock after a failed call to ice_get_set_tx_topo(). Releasing the lock indicates to firmware that global configuration (downloading of the DDP) has completed. Future attempts by this or other PFs to load the DDP will fail with a report that the DDP package has already been downloaded. Then, PFs will enter safe mode as they realize that the package on the device does not meet the minimum version requirement to load. The reported error messages are confusing, as they indicate the version of the default "safe mode" package in the NVM, rather than the version of the file loaded from /lib/firmware. Instead, we need to trigger CORER to clear global configuration. This is the lowest level of hardware reset which clears the global configuration lock and related state. It also clears any already downloaded DDP. Crucially, it does *not* clear the Tx scheduler topology configuration. Refactor ice_cfg_tx_topo() to always trigger a CORER after acquiring the global lock, regardless of success or failure of the topology configuration. We need to re-initialize the HW structure when we trigger the CORER. Thus, it makes sense for this to be the responsibility of ice_cfg_tx_topo() rather than its caller, ice_init_tx_topology(). This avoids needless re-initialization in cases where we don't attempt to update the Tx scheduler topology, such as if it has already been programmed. There is one catch: failure to re-initialize the HW struct should stop ice_probe(). If this function fails, we won't have a valid HW structure and cannot ensure the device is functioning properly. To handle this, ensure ice_cfg_tx_topo() returns a limited set of error codes. 
Set aside one specifically, -ENODEV, to indicate that the ice_init_tx_topology() should fail and stop probe. Other error codes indicate failure to apply the Tx scheduler topology. This is treated as a non-fatal error, with an informational message informing the system administrator that the updated Tx topology did not apply. This allows the device to load and function with the default Tx scheduler topology, rather than failing to load entirely. Note that this use of CORER will not result in loops with future PFs attempting to also load the invalid Tx topology configuration. The first PF will acquire the global configuration lock as part of programming the DDP. Each PF after this will attempt to acquire the global lock as part of programming the Tx topology, and will fail with the indication from firmware that global configuration is already complete. Tx scheduler topology configuration is only performed during driver init (probe or devlink reload) and not during cleanup for a CORER that happens after probe completes. Fixes: 91427e6d9030 ("ice: Support 5 layer topology") Signed-off-by: Jacob Keller Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ddp.c | 44 ++++++++++++++++++++++--------- drivers/net/ethernet/intel/ice/ice_main.c | 16 +++++++---- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index e2a036ce76ca..3b2d9c436979 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -2377,7 +2377,13 @@ ice_get_set_tx_topo(struct ice_hw *hw, u8 *buf, u16 buf_size, * The function will apply the new Tx topology from the package buffer * if available. * - * Return: zero when update was successful, negative values otherwise. + * Return: + * * 0 - Successfully applied topology configuration. + * * -EBUSY - Failed to acquire global configuration lock. + * * -EEXIST - Topology configuration has already been applied. + * * -EIO - Unable to apply topology configuration. + * * -ENODEV - Failed to re-initialize device after applying configuration. + * * Other negative error codes indicate unexpected failures. */ int ice_cfg_tx_topo(struct ice_hw *hw, const void *buf, u32 len) { @@ -2410,7 +2416,7 @@ int ice_cfg_tx_topo(struct ice_hw *hw, const void *buf, u32 len) if (status) { ice_debug(hw, ICE_DBG_INIT, "Get current topology is failed\n"); - return status; + return -EIO; } /* Is default topology already applied ? */ @@ -2497,31 +2503,45 @@ update_topo: ICE_GLOBAL_CFG_LOCK_TIMEOUT); if (status) { ice_debug(hw, ICE_DBG_INIT, "Failed to acquire global lock\n"); - return status; + return -EBUSY; } /* Check if reset was triggered already. */ reg = rd32(hw, GLGEN_RSTAT); if (reg & GLGEN_RSTAT_DEVSTATE_M) { - /* Reset is in progress, re-init the HW again */ ice_debug(hw, ICE_DBG_INIT, "Reset is in progress. 
Layer topology might be applied already\n"); ice_check_reset(hw); - return 0; + /* Reset is in progress, re-init the HW again */ + goto reinit_hw; } /* Set new topology */ status = ice_get_set_tx_topo(hw, new_topo, size, NULL, NULL, true); if (status) { - ice_debug(hw, ICE_DBG_INIT, "Failed setting Tx topology\n"); - return status; + ice_debug(hw, ICE_DBG_INIT, "Failed to set Tx topology, status %pe\n", + ERR_PTR(status)); + /* only report -EIO here as the caller checks the error value + * and reports an informational error message informing that + * the driver failed to program Tx topology. + */ + status = -EIO; } - /* New topology is updated, delay 1 second before issuing the CORER */ + /* Even if Tx topology config failed, we need to CORE reset here to + * clear the global configuration lock. Delay 1 second to allow + * hardware to settle then issue a CORER + */ msleep(1000); ice_reset(hw, ICE_RESET_CORER); - /* CORER will clear the global lock, so no explicit call - * required for release. - */ + ice_check_reset(hw); + +reinit_hw: + /* Since we triggered a CORER, re-initialize hardware */ + ice_deinit_hw(hw); + if (ice_init_hw(hw)) { + ice_debug(hw, ICE_DBG_INIT, "Failed to re-init hardware after setting Tx topology\n"); + return -ENODEV; + } - return 0; + return status; } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 8e0b06c1e02b..cae992d8f03c 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4536,17 +4536,23 @@ ice_init_tx_topology(struct ice_hw *hw, const struct firmware *firmware) dev_info(dev, "Tx scheduling layers switching feature disabled\n"); else dev_info(dev, "Tx scheduling layers switching feature enabled\n"); - /* if there was a change in topology ice_cfg_tx_topo triggered - * a CORER and we need to re-init hw + return 0; + } else if (err == -ENODEV) { + /* If we failed to re-initialize the device, we can no longer + * continue loading. */ - ice_deinit_hw(hw); - err = ice_init_hw(hw); - + dev_warn(dev, "Failed to initialize hardware after applying Tx scheduling configuration.\n"); return err; } else if (err == -EIO) { dev_info(dev, "DDP package does not support Tx scheduling layers switching feature - please update to the latest DDP package and try again\n"); + return 0; + } else if (err == -EEXIST) { + return 0; } + /* Do not treat this as a fatal error. */ + dev_info(dev, "Failed to apply Tx scheduling configuration, err %pe\n", + ERR_PTR(err)); return 0; } -- cgit v1.2.3 From 5c5e5b52bf05c7fe88768318c041052c5fac36b8 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 1 Aug 2025 15:27:12 -0700 Subject: ice: use fixed adapter index for E825C embedded devices The ice_adapter structure is used by the ice driver to connect multiple physical functions of a device in software. It was introduced by commit 0e2bddf9e5f9 ("ice: add ice_adapter for shared data across PFs on the same NIC") and is primarily used for PTP support, as well as for handling certain cross-PF synchronization. The original design of ice_adapter used PCI address information to determine which devices should be connected. This was extended to support E825C devices by commit fdb7f54700b1 ("ice: Initial support for E825C hardware in ice_adapter"), which used the device ID for E825C devices instead of the PCI address. Later, commit 0093cb194a75 ("ice: use DSN instead of PCI BDF for ice_adapter index") replaced the use of Bus/Device/Function addressing with use of the device serial number. 
E825C devices may appear in a "Dual NAC" configuration which has multiple physical devices tied to the same clock source and which need to use the same ice_adapter. Unfortunately, each "NAC" has its own NVM which has its own unique Device Serial Number. Thus, use of the DSN for connecting ice_adapter does not work properly. It "worked" in the pre-production systems because the DSN was not initialized on the test NVMs and all the NACs had the same zeroed serial number. Since we cannot rely on the DSN, let's fall back to the logic in the original E825C support which used the device ID. This is safe for E825C only because of the embedded nature of the device. It isn't a discrete adapter that can be plugged into an arbitrary system. All E825C devices on a given system are connected to the same clock source and need to be configured through the same PTP clock. To make this separation clear, reserve bit 63 of the 64-bit index values as a "fixed index" indicator. Always clear this bit when using the device serial number as an index. For E825C, use a fixed value defined as the 0x579C E825C backplane device ID bitwise ORed with the fixed index indicator. This is slightly different from the original logic of just using the device ID directly. Doing so prevents a potential issue with systems where only one of the NACs is connected with an external PHY over SGMII. In that case, one NAC would have the E825C_SGMII device ID, but the other would not. Separate the determination of the full 64-bit index from the 32-bit reduction logic. Provide both ice_adapter_index() and a wrapping ice_adapter_xa_index() which handles reducing the index to a long on 32-bit systems. As before, cache the full index value in the adapter structure to warn about collisions. This fixes issues with E825C not initializing PTP on both NACs, due to failure to connect the appropriate devices to the same ice_adapter. Fixes: 0093cb194a75 ("ice: use DSN instead of PCI BDF for ice_adapter index") Signed-off-by: Jacob Keller Reviewed-by: Grzegorz Nitka Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_adapter.c | 49 +++++++++++++++++++++------- drivers/net/ethernet/intel/ice/ice_adapter.h | 4 +-- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.c b/drivers/net/ethernet/intel/ice/ice_adapter.c index 9e4adc43e474..b53561c34708 100644 --- a/drivers/net/ethernet/intel/ice/ice_adapter.c +++ b/drivers/net/ethernet/intel/ice/ice_adapter.c @@ -13,16 +13,45 @@ static DEFINE_XARRAY(ice_adapters); static DEFINE_MUTEX(ice_adapters_mutex); -static unsigned long ice_adapter_index(u64 dsn) +#define ICE_ADAPTER_FIXED_INDEX BIT_ULL(63) + +#define ICE_ADAPTER_INDEX_E825C \ + (ICE_DEV_ID_E825C_BACKPLANE | ICE_ADAPTER_FIXED_INDEX) + +static u64 ice_adapter_index(struct pci_dev *pdev) { + switch (pdev->device) { + case ICE_DEV_ID_E825C_BACKPLANE: + case ICE_DEV_ID_E825C_QSFP: + case ICE_DEV_ID_E825C_SFP: + case ICE_DEV_ID_E825C_SGMII: + /* E825C devices have multiple NACs which are connected to the + * same clock source, and which must share the same + * ice_adapter structure. We can't use the serial number since + * each NAC has its own NVM generated with its own unique + * Device Serial Number. Instead, rely on the embedded nature + * of the E825C devices, and use a fixed index. 
This relies on + * the fact that all E825C physical functions in a given + * system are part of the same overall device. + */ + return ICE_ADAPTER_INDEX_E825C; + default: + return pci_get_dsn(pdev) & ~ICE_ADAPTER_FIXED_INDEX; + } +} + +static unsigned long ice_adapter_xa_index(struct pci_dev *pdev) +{ + u64 index = ice_adapter_index(pdev); + #if BITS_PER_LONG == 64 - return dsn; + return index; #else - return (u32)dsn ^ (u32)(dsn >> 32); + return (u32)index ^ (u32)(index >> 32); #endif } -static struct ice_adapter *ice_adapter_new(u64 dsn) +static struct ice_adapter *ice_adapter_new(struct pci_dev *pdev) { struct ice_adapter *adapter; @@ -30,7 +59,7 @@ static struct ice_adapter *ice_adapter_new(u64 dsn) if (!adapter) return NULL; - adapter->device_serial_number = dsn; + adapter->index = ice_adapter_index(pdev); spin_lock_init(&adapter->ptp_gltsyn_time_lock); spin_lock_init(&adapter->txq_ctx_lock); refcount_set(&adapter->refcount, 1); @@ -64,24 +93,23 @@ static void ice_adapter_free(struct ice_adapter *adapter) */ struct ice_adapter *ice_adapter_get(struct pci_dev *pdev) { - u64 dsn = pci_get_dsn(pdev); struct ice_adapter *adapter; unsigned long index; int err; - index = ice_adapter_index(dsn); + index = ice_adapter_xa_index(pdev); scoped_guard(mutex, &ice_adapters_mutex) { err = xa_insert(&ice_adapters, index, NULL, GFP_KERNEL); if (err == -EBUSY) { adapter = xa_load(&ice_adapters, index); refcount_inc(&adapter->refcount); - WARN_ON_ONCE(adapter->device_serial_number != dsn); + WARN_ON_ONCE(adapter->index != ice_adapter_index(pdev)); return adapter; } if (err) return ERR_PTR(err); - adapter = ice_adapter_new(dsn); + adapter = ice_adapter_new(pdev); if (!adapter) return ERR_PTR(-ENOMEM); xa_store(&ice_adapters, index, adapter, GFP_KERNEL); @@ -100,11 +128,10 @@ struct ice_adapter *ice_adapter_get(struct pci_dev *pdev) */ void ice_adapter_put(struct pci_dev *pdev) { - u64 dsn = pci_get_dsn(pdev); struct ice_adapter *adapter; unsigned long index; - index = ice_adapter_index(dsn); + index = ice_adapter_xa_index(pdev); scoped_guard(mutex, &ice_adapters_mutex) { adapter = xa_load(&ice_adapters, index); if (WARN_ON(!adapter)) diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.h b/drivers/net/ethernet/intel/ice/ice_adapter.h index db66d03c9f96..e95266c7f20b 100644 --- a/drivers/net/ethernet/intel/ice/ice_adapter.h +++ b/drivers/net/ethernet/intel/ice/ice_adapter.h @@ -33,7 +33,7 @@ struct ice_port_list { * @txq_ctx_lock: Spinlock protecting access to the GLCOMM_QTX_CNTX_CTL register * @ctrl_pf: Control PF of the adapter * @ports: Ports list - * @device_serial_number: DSN cached for collision detection on 32bit systems + * @index: 64-bit index cached for collision detection on 32bit systems */ struct ice_adapter { refcount_t refcount; @@ -44,7 +44,7 @@ struct ice_adapter { struct ice_pf *ctrl_pf; struct ice_port_list ports; - u64 device_serial_number; + u64 index; }; struct ice_adapter *ice_adapter_get(struct pci_dev *pdev); -- cgit v1.2.3 From b1a0c977c6f1130f7dd125ee3db8c2435d7e3d41 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 8 Aug 2025 17:53:10 +0200 Subject: ice: fix incorrect counter for buffer allocation failures Currently, the driver increments `alloc_page_failed` when buffer allocation fails in `ice_clean_rx_irq()`. However, this counter is intended for page allocation failures, not buffer allocation issues. This patch corrects the counter by incrementing `alloc_buf_failed` instead, ensuring accurate statistics reporting for buffer allocation failures. 
Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Reported-by: Jacob Keller Suggested-by: Paul Menzel Signed-off-by: Michal Kubiak Reviewed-by: Paul Menzel Reviewed-by: Jason Xing Reviewed-by: Aleksandr Loktionov Tested-by: Priya Singh Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 29e0088ab6b2..d2871757ec94 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1352,7 +1352,7 @@ construct_skb: skb = ice_construct_skb(rx_ring, xdp); /* exit if we failed to retrieve a buffer */ if (!skb) { - rx_ring->ring_stats->rx_stats.alloc_page_failed++; + rx_ring->ring_stats->rx_stats.alloc_buf_failed++; xdp_verdict = ICE_XDP_CONSUMED; } ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); -- cgit v1.2.3 From ed913b343dcf9f623e7436fa1a153c89b22d109b Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 31 Jul 2025 14:45:33 +0200 Subject: ixgbe: fix ixgbe_orom_civd_info struct layout The current layout of struct ixgbe_orom_civd_info causes incorrect data storage due to compiler-inserted padding. This results in issues when writing OROM data into the structure. Add the __packed attribute to ensure the structure layout matches the expected binary format without padding. Fixes: 70db0788a262 ("ixgbe: read the OROM version information") Reviewed-by: Aleksandr Loktionov Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index d74116441d1c..bfeef5b0b99d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -3125,7 +3125,7 @@ static int ixgbe_get_orom_ver_info(struct ixgbe_hw *hw, if (err) return err; - combo_ver = le32_to_cpu(civd.combo_ver); + combo_ver = get_unaligned_le32(&civd.combo_ver); orom->major = (u8)FIELD_GET(IXGBE_OROM_VER_MASK, combo_ver); orom->patch = (u8)FIELD_GET(IXGBE_OROM_VER_PATCH_MASK, combo_ver); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index d2f22d8558f8..ff8d640a50b1 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -932,7 +932,7 @@ struct ixgbe_orom_civd_info { __le32 combo_ver; /* Combo Image Version number */ u8 combo_name_len; /* Length of the unicode combo image version string, max of 32 */ __le16 combo_name[32]; /* Unicode string representing the Combo Image version */ -}; +} __packed; /* Function specific capabilities */ struct ixgbe_hw_func_caps { -- cgit v1.2.3 From d280233fc86692f495d5e08092e5422bc2f583a8 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Fri, 22 Aug 2025 16:28:05 +0530 Subject: Octeontx2-af: Fix NIX X2P calibration failures Before configuring the NIX block, the AF driver initiates the "NIX block X2P bus calibration" and verifies that NIX interfaces such as CGX and LBK are active and functioning correctly. 
On a few silicon variants (CNF10KA and CNF10KB), X2P calibration failures have been observed on some CGX blocks that are not mapped to the NIX block. Since both NIX-mapped and non-NIX-mapped CGX blocks share the same VENDOR, DEVICE and SUBSYS_DEVID, it's not possible to skip the probe based on these parameters. This patch introduces an "is_cgx_mapped_to_nix" API to detect and skip probing of non-NIX-mapped CGX blocks. Fixes: aba53d5dbcea ("octeontx2-af: NIX block admin queue init") Signed-off-by: Hariprasad Kelam Link: https://patch.msgid.link/20250822105805.2236528-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 7 +++++++ drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 4ff19a04b23e..0c46ba8a5adc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -1978,6 +1978,13 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_release_regions; } + if (!is_cn20k(pdev) && + !is_cgx_mapped_to_nix(pdev->subsystem_device, cgx->cgx_id)) { + dev_notice(dev, "CGX %d not mapped to NIX, skipping probe\n", + cgx->cgx_id); + goto err_release_regions; + } + cgx->lmac_count = cgx->mac_ops->get_nr_lmacs(cgx); if (!cgx->lmac_count) { dev_notice(dev, "CGX %d LMAC count is zero, skipping probe\n", cgx->cgx_id); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 7ee1fdeb5295..18c7bb39dbc7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -783,6 +783,20 @@ static inline bool is_cn10kb(struct rvu *rvu) return false; } +static inline bool is_cgx_mapped_to_nix(unsigned short id, u8 cgx_id) +{ + /* On CNF10KA and CNF10KB silicons only two CGX blocks are connected + * to NIX. 
+ */ + if (id == PCI_SUBSYS_DEVID_CNF10K_A || id == PCI_SUBSYS_DEVID_CNF10K_B) + return cgx_id <= 1; + + return !(cgx_id && !(id == PCI_SUBSYS_DEVID_96XX || + id == PCI_SUBSYS_DEVID_98XX || + id == PCI_SUBSYS_DEVID_CN10K_A || + id == PCI_SUBSYS_DEVID_CN10K_B)); +} + static inline bool is_rvu_npc_hash_extract_en(struct rvu *rvu) { u64 npc_const3; -- cgit v1.2.3 From 97766512a9951b9fd6fc97f1b93211642bb0b220 Mon Sep 17 00:00:00 2001 From: Vladimir Riabchun Date: Fri, 22 Aug 2025 20:11:36 +0200 Subject: mISDN: hfcpci: Fix warning when deleting uninitialized timer With CONFIG_DEBUG_OBJECTS_TIMERS unloading hfcpci module leads to the following splat: [ 250.215892] ODEBUG: assert_init not available (active state 0) object: ffffffffc01a3dc0 object type: timer_list hint: 0x0 [ 250.217520] WARNING: CPU: 0 PID: 233 at lib/debugobjects.c:612 debug_print_object+0x1b6/0x2c0 [ 250.218775] Modules linked in: hfcpci(-) mISDN_core [ 250.219537] CPU: 0 UID: 0 PID: 233 Comm: rmmod Not tainted 6.17.0-rc2-g6f713187ac98 #2 PREEMPT(voluntary) [ 250.220940] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 250.222377] RIP: 0010:debug_print_object+0x1b6/0x2c0 [ 250.223131] Code: fc ff df 48 89 fa 48 c1 ea 03 80 3c 02 00 75 4f 41 56 48 8b 14 dd a0 4e 01 9f 48 89 ee 48 c7 c7 20 46 01 9f e8 cb 84d [ 250.225805] RSP: 0018:ffff888015ea7c08 EFLAGS: 00010286 [ 250.226608] RAX: 0000000000000000 RBX: 0000000000000005 RCX: ffffffff9be93a95 [ 250.227708] RDX: 1ffff1100d945138 RSI: 0000000000000008 RDI: ffff88806ca289c0 [ 250.228993] RBP: ffffffff9f014a00 R08: 0000000000000001 R09: ffffed1002bd4f39 [ 250.230043] R10: ffff888015ea79cf R11: 0000000000000001 R12: 0000000000000001 [ 250.231185] R13: ffffffff9eea0520 R14: 0000000000000000 R15: ffff888015ea7cc8 [ 250.232454] FS: 00007f3208f01540(0000) GS:ffff8880caf5a000(0000) knlGS:0000000000000000 [ 250.233851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 250.234856] CR2: 00007f32090a7421 CR3: 0000000004d63000 CR4: 00000000000006f0 [ 250.236117] Call Trace: [ 250.236599] [ 250.236967] ? trace_irq_enable.constprop.0+0xd4/0x130 [ 250.237920] debug_object_assert_init+0x1f6/0x310 [ 250.238762] ? __pfx_debug_object_assert_init+0x10/0x10 [ 250.239658] ? __lock_acquire+0xdea/0x1c70 [ 250.240369] __try_to_del_timer_sync+0x69/0x140 [ 250.241172] ? __pfx___try_to_del_timer_sync+0x10/0x10 [ 250.242058] ? __timer_delete_sync+0xc6/0x120 [ 250.242842] ? lock_acquire+0x30/0x80 [ 250.243474] ? __timer_delete_sync+0xc6/0x120 [ 250.244262] __timer_delete_sync+0x98/0x120 [ 250.245015] HFC_cleanup+0x10/0x20 [hfcpci] [ 250.245704] __do_sys_delete_module+0x348/0x510 [ 250.246461] ? __pfx___do_sys_delete_module+0x10/0x10 [ 250.247338] do_syscall_64+0xc1/0x360 [ 250.247924] entry_SYSCALL_64_after_hwframe+0x77/0x7f Fix this by initializing hfc_tl timer with DEFINE_TIMER macro. Also, use mod_timer instead of manual timeout update. 
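For illustration, the idiom the fix moves to, in a standalone sketch with placeholder names (not the hfcpci ones): a statically defined timer is fully initialized from load time, so it can be re-armed with mod_timer() and safely deleted on module unload even if it was never started.

#include <linux/timer.h>
#include <linux/jiffies.h>

static void example_poll(struct timer_list *unused);
static DEFINE_TIMER(example_tl, example_poll);	/* initialized at build time */

static void example_poll(struct timer_list *unused)
{
	/* ... periodic work ... */
	mod_timer(&example_tl, jiffies + HZ / 100);	/* single re-arm path */
}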
Fixes: 87c5fa1bb426 ("mISDN: Add different different timer settings for hfc-pci") Fixes: 175302f6b79e ("mISDN: hfcpci: Fix use-after-free bug in hfcpci_softirq") Signed-off-by: Vladimir Riabchun Link: https://patch.msgid.link/aKiy2D_LiWpQ5kXq@vova-pc Signed-off-by: Jakub Kicinski --- drivers/isdn/hardware/mISDN/hfcpci.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c index 2b05722d4dbe..ea8a0ab47afd 100644 --- a/drivers/isdn/hardware/mISDN/hfcpci.c +++ b/drivers/isdn/hardware/mISDN/hfcpci.c @@ -39,12 +39,13 @@ #include "hfc_pci.h" +static void hfcpci_softirq(struct timer_list *unused); static const char *hfcpci_revision = "2.0"; static int HFC_cnt; static uint debug; static uint poll, tics; -static struct timer_list hfc_tl; +static DEFINE_TIMER(hfc_tl, hfcpci_softirq); static unsigned long hfc_jiffies; MODULE_AUTHOR("Karsten Keil"); @@ -2305,8 +2306,7 @@ hfcpci_softirq(struct timer_list *unused) hfc_jiffies = jiffies + 1; else hfc_jiffies += tics; - hfc_tl.expires = hfc_jiffies; - add_timer(&hfc_tl); + mod_timer(&hfc_tl, hfc_jiffies); } static int __init @@ -2332,10 +2332,8 @@ HFC_init(void) if (poll != HFCPCI_BTRANS_THRESHOLD) { printk(KERN_INFO "%s: Using alternative poll value of %d\n", __func__, poll); - timer_setup(&hfc_tl, hfcpci_softirq, 0); - hfc_tl.expires = jiffies + tics; - hfc_jiffies = hfc_tl.expires; - add_timer(&hfc_tl); + hfc_jiffies = jiffies + tics; + mod_timer(&hfc_tl, hfc_jiffies); } else tics = 0; /* indicate the use of controller's timer */ -- cgit v1.2.3 From 007a5ffadc4fd51739527f1503b7cf048f31c413 Mon Sep 17 00:00:00 2001 From: Yeounsu Moon Date: Sun, 24 Aug 2025 03:29:24 +0900 Subject: net: dlink: fix multicast stats being counted incorrectly `McstFramesRcvdOk` counts the number of received multicast packets, and it reports the value correctly. However, reading `McstFramesRcvdOk` clears the register to zero. As a result, the driver was reporting only the packets since the last read, instead of the accumulated total. Fix this by accumulating the multicast statistics instead of overwriting them with the latest register value. 
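To illustrate the clear-on-read behaviour, a small sketch with hypothetical names (read_and_clear_counter() is not a dl2k helper): each read returns only the events counted since the previous read, so the value must be added to the running total rather than assigned.

#include <linux/netdevice.h>
#include <linux/io.h>

static u32 read_and_clear_counter(void __iomem *ioaddr, unsigned int reg);	/* hypothetical */

static void example_update_mcast_stats(struct net_device *dev,
				       void __iomem *ioaddr, unsigned int reg)
{
	u32 delta = read_and_clear_counter(ioaddr, reg);

	dev->stats.multicast += delta;	/* '+=' keeps the accumulated total */
	/* 'dev->stats.multicast = delta;' would report only the last interval */
}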
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Tested-on: D-Link DGE-550T Rev-A3 Signed-off-by: Yeounsu Moon Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250823182927.6063-3-yyyynoom@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/dlink/dl2k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/dlink/dl2k.c b/drivers/net/ethernet/dlink/dl2k.c index cc60ee454bf9..6bbf6e5584e5 100644 --- a/drivers/net/ethernet/dlink/dl2k.c +++ b/drivers/net/ethernet/dlink/dl2k.c @@ -1099,7 +1099,7 @@ get_stats (struct net_device *dev) dev->stats.rx_bytes += dr32(OctetRcvOk); dev->stats.tx_bytes += dr32(OctetXmtOk); - dev->stats.multicast = dr32(McstFramesRcvdOk); + dev->stats.multicast += dr32(McstFramesRcvdOk); dev->stats.collisions += dr32(SingleColFrames) + dr32(MultiColFrames); -- cgit v1.2.3 From e81a7f65288c7e2cfb7e7890f648e099fd885ab3 Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Fri, 22 Aug 2025 11:13:24 +0200 Subject: net: usb: qmi_wwan: add Telit Cinterion LE910C4-WWX new compositions Add the following Telit Cinterion LE910C4-WWX new compositions: 0x1034: tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1034 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1037: tty (diag) + tty (Telit custom) + tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 15 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1037 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) 
MxPS= 64 Ivl=2ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1038: tty (Telit custom) + tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 9 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1038 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Link: https://patch.msgid.link/20250822091324.39558-1-Fabio.Porcedda@telit.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index e56901bb6ebc..11352d85475a 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1355,6 +1355,9 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1031, 3)}, /* Telit LE910C1-EUX */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x1034, 2)}, /* Telit LE910C4-WWX */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x1037, 4)}, /* Telit LE910C4-WWX */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x1038, 3)}, /* Telit LE910C4-WWX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x103a, 0)}, /* Telit LE910C4-WWX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1050, 2)}, /* Telit FN980 */ -- cgit v1.2.3 From 882e57cbc7204662f6c5672d5b04336c1d790b03 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 25 Aug 2025 08:55:43 +0200 Subject: phy: mscc: Fix when PTP clock is register and unregister It looks like that every time when the interface was set down and up the driver was creating a new ptp clock. On top of this the function ptp_clock_unregister was never called. Therefore fix this by calling ptp_clock_register and initialize the mii_ts struct inside the probe function and call ptp_clock_unregister when driver is removed. 
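A rough sketch of the resulting lifetime, with placeholder names rather than the mscc driver's internals: the PHC is registered once in probe and unregistered once in remove, so repeated down/up cycles of the interface no longer create new clocks.

#include <linux/err.h>
#include <linux/phy.h>
#include <linux/ptp_clock_kernel.h>

struct example_priv {
	struct ptp_clock_info caps;
	struct ptp_clock *ptp_clock;
};

static int example_probe(struct phy_device *phydev)
{
	struct example_priv *priv = phydev->priv;

	priv->ptp_clock = ptp_clock_register(&priv->caps, &phydev->mdio.dev);
	return PTR_ERR_OR_ZERO(priv->ptp_clock);	/* registered exactly once */
}

static void example_remove(struct phy_device *phydev)
{
	struct example_priv *priv = phydev->priv;

	if (priv->ptp_clock)
		ptp_clock_unregister(priv->ptp_clock);	/* paired teardown */
}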
Fixes: 7d272e63e0979d ("net: phy: mscc: timestamping and PHC support") Signed-off-by: Horatiu Vultur Reviewed-by: Vadim Fedorenko Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250825065543.2916334-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mscc/mscc.h | 4 ++++ drivers/net/phy/mscc/mscc_main.c | 4 +--- drivers/net/phy/mscc/mscc_ptp.c | 34 +++++++++++++++++++++------------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h index 58c6d47fbe04..2bfe314ef881 100644 --- a/drivers/net/phy/mscc/mscc.h +++ b/drivers/net/phy/mscc/mscc.h @@ -481,6 +481,7 @@ static inline void vsc8584_config_macsec_intr(struct phy_device *phydev) void vsc85xx_link_change_notify(struct phy_device *phydev); void vsc8584_config_ts_intr(struct phy_device *phydev); int vsc8584_ptp_init(struct phy_device *phydev); +void vsc8584_ptp_deinit(struct phy_device *phydev); int vsc8584_ptp_probe_once(struct phy_device *phydev); int vsc8584_ptp_probe(struct phy_device *phydev); irqreturn_t vsc8584_handle_ts_interrupt(struct phy_device *phydev); @@ -495,6 +496,9 @@ static inline int vsc8584_ptp_init(struct phy_device *phydev) { return 0; } +static inline void vsc8584_ptp_deinit(struct phy_device *phydev) +{ +} static inline int vsc8584_ptp_probe_once(struct phy_device *phydev) { return 0; diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index f1c9ce351ab4..24c75903f535 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -2337,9 +2337,7 @@ static int vsc85xx_probe(struct phy_device *phydev) static void vsc85xx_remove(struct phy_device *phydev) { - struct vsc8531_private *priv = phydev->priv; - - skb_queue_purge(&priv->rx_skbs_list); + vsc8584_ptp_deinit(phydev); } /* Microsemi VSC85xx PHYs */ diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c index de6c7312e8f2..72847320cb65 100644 --- a/drivers/net/phy/mscc/mscc_ptp.c +++ b/drivers/net/phy/mscc/mscc_ptp.c @@ -1298,7 +1298,6 @@ static void vsc8584_set_input_clk_configured(struct phy_device *phydev) static int __vsc8584_init_ptp(struct phy_device *phydev) { - struct vsc8531_private *vsc8531 = phydev->priv; static const u32 ltc_seq_e[] = { 0, 400000, 0, 0, 0 }; static const u8 ltc_seq_a[] = { 8, 6, 5, 4, 2 }; u32 val; @@ -1515,17 +1514,7 @@ static int __vsc8584_init_ptp(struct phy_device *phydev) vsc85xx_ts_eth_cmp1_sig(phydev); - vsc8531->mii_ts.rxtstamp = vsc85xx_rxtstamp; - vsc8531->mii_ts.txtstamp = vsc85xx_txtstamp; - vsc8531->mii_ts.hwtstamp = vsc85xx_hwtstamp; - vsc8531->mii_ts.ts_info = vsc85xx_ts_info; - phydev->mii_ts = &vsc8531->mii_ts; - - memcpy(&vsc8531->ptp->caps, &vsc85xx_clk_caps, sizeof(vsc85xx_clk_caps)); - - vsc8531->ptp->ptp_clock = ptp_clock_register(&vsc8531->ptp->caps, - &phydev->mdio.dev); - return PTR_ERR_OR_ZERO(vsc8531->ptp->ptp_clock); + return 0; } void vsc8584_config_ts_intr(struct phy_device *phydev) @@ -1552,6 +1541,16 @@ int vsc8584_ptp_init(struct phy_device *phydev) return 0; } +void vsc8584_ptp_deinit(struct phy_device *phydev) +{ + struct vsc8531_private *vsc8531 = phydev->priv; + + if (vsc8531->ptp->ptp_clock) { + ptp_clock_unregister(vsc8531->ptp->ptp_clock); + skb_queue_purge(&vsc8531->rx_skbs_list); + } +} + irqreturn_t vsc8584_handle_ts_interrupt(struct phy_device *phydev) { struct vsc8531_private *priv = phydev->priv; @@ -1612,7 +1611,16 @@ int vsc8584_ptp_probe(struct phy_device *phydev) vsc8531->ptp->phydev = phydev; - 
return 0; + vsc8531->mii_ts.rxtstamp = vsc85xx_rxtstamp; + vsc8531->mii_ts.txtstamp = vsc85xx_txtstamp; + vsc8531->mii_ts.hwtstamp = vsc85xx_hwtstamp; + vsc8531->mii_ts.ts_info = vsc85xx_ts_info; + phydev->mii_ts = &vsc8531->mii_ts; + + memcpy(&vsc8531->ptp->caps, &vsc85xx_clk_caps, sizeof(vsc85xx_clk_caps)); + vsc8531->ptp->ptp_clock = ptp_clock_register(&vsc8531->ptp->caps, + &phydev->mdio.dev); + return PTR_ERR_OR_ZERO(vsc8531->ptp->ptp_clock); } int vsc8584_ptp_probe_once(struct phy_device *phydev) -- cgit v1.2.3 From 26c1f55f7ec8d1a4bde8c50e4ee04e3c8c6b27e8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 25 Aug 2025 08:57:53 -0700 Subject: MAINTAINERS: retire Boris from TLS maintainers There's a steady stream of TLS changes and bugs. We need active maintainers in this area, and Boris hasn't been participating much in upstream work. Move him to CREDITS. While at it also add Dave Watson there who was the author of the initial SW implementation, AFAIU. Link: https://patch.msgid.link/20250825155753.2178045-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- CREDITS | 7 +++++++ MAINTAINERS | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index a357f9cbb05d..a687c3c35c4c 100644 --- a/CREDITS +++ b/CREDITS @@ -3222,6 +3222,10 @@ D: AIC5800 IEEE 1394, RAW I/O on 1394 D: Starter of Linux1394 effort S: ask per mail for current address +N: Boris Pismenny +E: borisp@mellanox.com +D: Kernel TLS implementation and offload support. + N: Nicolas Pitre E: nico@fluxnic.net D: StrongARM SA1100 support integrator & hacker @@ -4168,6 +4172,9 @@ S: 1513 Brewster Dr. S: Carrollton, TX 75010 S: USA +N: Dave Watson +D: Kernel TLS implementation. + N: Tim Waugh E: tim@cyberelk.net D: Co-architect of the parallel-port sharing system diff --git a/MAINTAINERS b/MAINTAINERS index 2720544cd91f..1897d8b45df4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17818,7 +17818,6 @@ F: net/ipv6/syncookies.c F: net/ipv6/tcp*.c NETWORKING [TLS] -M: Boris Pismenny M: John Fastabend M: Jakub Kicinski L: netdev@vger.kernel.org -- cgit v1.2.3 From 16c8a3a67ec799fc731919e3e51be9af6cdf541d Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 25 Aug 2025 13:21:34 -0400 Subject: net: macb: Fix offset error in gem_update_stats hw_stats now has only one variable for tx_octets/rx_octets, so we should only increment p once, not twice. This would cause the statistics to be reported under the wrong categories in `ethtool -S --all-groups` (which uses hw_stats) but not `ethtool -S` (which uses ethtool_stats). 
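A simplified, self-contained sketch of the cursor walk involved (the arrays stand in for register reads and are not the macb layout): the destination pointer must advance exactly once per 64-bit counter, because the high word of a wide counter belongs to the same field.

/* lo[] and hi[] hold the 32-bit register halves already read from hardware;
 * hi[i] is zero for counters that have no high word.
 */
static void example_update_stats(u64 *stats, const u32 *lo, const u32 *hi,
				 unsigned int count)
{
	u64 *p = stats;
	unsigned int i;

	for (i = 0; i < count; i++, p++) {
		*p += lo[i];
		*p += (u64)hi[i] << 32;	/* same counter: the cursor advances once */
	}
}

Advancing the pointer a second time for the high word shifts every subsequent value into the next field, which is the category mismatch described above.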
Signed-off-by: Sean Anderson Fixes: f6af690a295a ("net: cadence: macb: Report standard stats") Link: https://patch.msgid.link/20250825172134.681861-1-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index b29c3beae0b2..106885451147 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3090,7 +3090,7 @@ static void gem_update_stats(struct macb *bp) /* Add GEM_OCTTXH, GEM_OCTRXH */ val = bp->macb_reg_readl(bp, offset + 4); bp->ethtool_stats[i] += ((u64)val) << 32; - *(p++) += ((u64)val) << 32; + *p += ((u64)val) << 32; } } -- cgit v1.2.3 From d9b0ca1334d8a9a03bef45e95825564c56ca3367 Mon Sep 17 00:00:00 2001 From: Boon Khai Ng Date: Mon, 25 Aug 2025 15:13:21 +0800 Subject: MAINTAINERS: Update maintainer information for Altera Triple Speed Ethernet Driver The previous maintainer, Joyce Ooi, is no longer with the company, and her email is no longer reachable. As a result, the maintainer information for the Altera Triple Speed Ethernet Driver has been updated. Changes: - Replaced Joyce Ooi's email with Boon Khai Ng's email address. - Kept the component's status as "Maintained". Signed-off-by: Boon Khai Ng Link: https://patch.msgid.link/20250825071321.30131-1-boon.khai.ng@altera.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1897d8b45df4..c5b47955d2a6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -937,7 +937,7 @@ S: Maintained F: drivers/gpio/gpio-altera.c ALTERA TRIPLE SPEED ETHERNET DRIVER -M: Joyce Ooi +M: Boon Khai Ng L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/altera/ -- cgit v1.2.3 From 2747328ba2714f1a7454208dbbc1dc0631990b4a Mon Sep 17 00:00:00 2001 From: Sreekanth Reddy Date: Mon, 25 Aug 2025 10:59:25 -0700 Subject: bnxt_en: Fix memory corruption when FW resources change during ifdown bnxt_set_dflt_rings() assumes that it is always called before any TC has been created. So it doesn't take bp->num_tc into account and assumes that it is always 0 or 1. In the FW resource or capability change scenario, the FW will return flags in bnxt_hwrm_if_change() that will cause the driver to reinitialize and call bnxt_cancel_reservations(). This will lead to bnxt_init_dflt_ring_mode() calling bnxt_set_dflt_rings() and bp->num_tc may be greater than 1. This will cause bp->tx_ring[] to be sized too small and cause memory corruption in bnxt_alloc_cp_rings(). Fix it by properly scaling the TX rings by bp->num_tc in the code paths mentioned above. Add 2 helper functions to determine bp->tx_nr_rings and bp->tx_nr_rings_per_tc. 
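The scaling those helpers encode, as a standalone sketch with illustrative numbers: with num_tc = 2 and 4 rings per TC the driver must provision 8 TX rings, and sizing the ring array for only 4 is what corrupts memory.

static unsigned int example_total_tx_rings(unsigned int num_tc,
					   unsigned int rings_per_tc)
{
	/* No TCs configured behaves like a single implicit class. */
	return num_tc ? num_tc * rings_per_tc : rings_per_tc;
}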
Fixes: ec5d31e3c15d ("bnxt_en: Handle firmware reset status during IF_UP.") Reviewed-by: Kalesh AP Reviewed-by: Andy Gospodarek Signed-off-by: Sreekanth Reddy Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250825175927.459987-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 207a8bb36ae5..1f5c06f1296b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12851,6 +12851,17 @@ static int bnxt_set_xps_mapping(struct bnxt *bp) return rc; } +static int bnxt_tx_nr_rings(struct bnxt *bp) +{ + return bp->num_tc ? bp->tx_nr_rings_per_tc * bp->num_tc : + bp->tx_nr_rings_per_tc; +} + +static int bnxt_tx_nr_rings_per_tc(struct bnxt *bp) +{ + return bp->num_tc ? bp->tx_nr_rings / bp->num_tc : bp->tx_nr_rings; +} + static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) { int rc = 0; @@ -16325,7 +16336,7 @@ static void bnxt_trim_dflt_sh_rings(struct bnxt *bp) bp->cp_nr_rings = min_t(int, bp->tx_nr_rings_per_tc, bp->rx_nr_rings); bp->rx_nr_rings = bp->cp_nr_rings; bp->tx_nr_rings_per_tc = bp->cp_nr_rings; - bp->tx_nr_rings = bp->tx_nr_rings_per_tc; + bp->tx_nr_rings = bnxt_tx_nr_rings(bp); } static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh) @@ -16357,7 +16368,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh) bnxt_trim_dflt_sh_rings(bp); else bp->cp_nr_rings = bp->tx_nr_rings_per_tc + bp->rx_nr_rings; - bp->tx_nr_rings = bp->tx_nr_rings_per_tc; + bp->tx_nr_rings = bnxt_tx_nr_rings(bp); avail_msix = bnxt_get_max_func_irqs(bp) - bp->cp_nr_rings; if (avail_msix >= BNXT_MIN_ROCE_CP_RINGS) { @@ -16370,7 +16381,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh) rc = __bnxt_reserve_rings(bp); if (rc && rc != -ENODEV) netdev_warn(bp->dev, "Unable to reserve tx rings\n"); - bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp); if (sh) bnxt_trim_dflt_sh_rings(bp); @@ -16379,7 +16390,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh) rc = __bnxt_reserve_rings(bp); if (rc && rc != -ENODEV) netdev_warn(bp->dev, "2nd rings reservation failed.\n"); - bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp); } if (BNXT_CHIP_TYPE_NITRO_A0(bp)) { bp->rx_nr_rings++; @@ -16413,7 +16424,7 @@ static int bnxt_init_dflt_ring_mode(struct bnxt *bp) if (rc) goto init_dflt_ring_err; - bp->tx_nr_rings_per_tc = bp->tx_nr_rings; + bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp); bnxt_set_dflt_rfs(bp); -- cgit v1.2.3 From 1ee581c24dfdcbc6de25aac95a48c1f08e9a542c Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 25 Aug 2025 10:59:26 -0700 Subject: bnxt_en: Adjust TX rings if reservation is less than requested Before we accept an ethtool request to increase a resource (such as rings), we call the FW to check that the requested resource is likely available first before we commit. But it is still possible that the actual reservation or allocation can fail. The existing code is missing the logic to adjust the TX rings in case the reserved TX rings are less than requested. Add a warning message (a similar message for RX rings already exists) and add the logic to adjust the TX rings. 
Without this fix, the number of TX rings reported to the stack can exceed the actual TX rings and ethtool -l will report more than the actual TX rings. Fixes: 674f50a5b026 ("bnxt_en: Implement new method to reserve rings.") Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250825175927.459987-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1f5c06f1296b..86fc9d340dab 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8024,6 +8024,11 @@ static int __bnxt_reserve_rings(struct bnxt *bp) hwr.rx = rx_rings << 1; tx_cp = bnxt_num_tx_to_cp(bp, hwr.tx); hwr.cp = sh ? max_t(int, tx_cp, rx_rings) : tx_cp + rx_rings; + if (hwr.tx != bp->tx_nr_rings) { + netdev_warn(bp->dev, + "Able to reserve only %d out of %d requested TX rings\n", + hwr.tx, bp->tx_nr_rings); + } bp->tx_nr_rings = hwr.tx; /* If we cannot reserve all the RX rings, reset the RSS map only @@ -12879,6 +12884,13 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) if (rc) return rc; + /* Make adjustments if reserved TX rings are less than requested */ + bp->tx_nr_rings -= bp->tx_nr_rings_xdp; + bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp); + if (bp->tx_nr_rings_xdp) { + bp->tx_nr_rings_xdp = bp->tx_nr_rings_per_tc; + bp->tx_nr_rings += bp->tx_nr_rings_xdp; + } rc = bnxt_alloc_mem(bp, irq_re_init); if (rc) { netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); -- cgit v1.2.3 From b4fc8faacfea2538184a1dbd616ae9447a361f3d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 25 Aug 2025 10:59:27 -0700 Subject: bnxt_en: Fix stats context reservation logic The HW resource reservation logic allows the L2 driver to use the RoCE resources if the RoCE driver is not registered. When calculating the stats contexts available for L2, we should not blindly subtract the stats contexts reserved for RoCE unless the RoCE driver is registered. This bug may cause the L2 rings to be less than the number requested when we are close to running out of stats contexts. 
Fixes: 2e4592dc9bee ("bnxt_en: Change MSIX/NQs allocation policy") Reviewed-by: Kalesh AP Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250825175927.459987-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 86fc9d340dab..31e3d825b4bc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8016,7 +8016,8 @@ static int __bnxt_reserve_rings(struct bnxt *bp) } rx_rings = min_t(int, rx_rings, hwr.grp); hwr.cp = min_t(int, hwr.cp, bp->cp_nr_rings); - if (hwr.stat > bnxt_get_ulp_stat_ctxs(bp)) + if (bnxt_ulp_registered(bp->edev) && + hwr.stat > bnxt_get_ulp_stat_ctxs(bp)) hwr.stat -= bnxt_get_ulp_stat_ctxs(bp); hwr.cp = min_t(int, hwr.cp, hwr.stat); rc = bnxt_trim_rings(bp, &rx_rings, &hwr.tx, hwr.cp, sh); -- cgit v1.2.3 From 2c0a959bebdc1ada13cf9a8242f177c5400299e6 Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 25 Aug 2025 17:34:24 +0300 Subject: net/mlx5: HWS, Fix memory leak in hws_pool_buddy_init error path In the error path of hws_pool_buddy_init(), the buddy allocator cleanup doesn't free the allocator structure itself, causing a memory leak. Add the missing kfree() to properly release all allocated memory. Fixes: c61afff94373 ("net/mlx5: HWS, added memory management handling") Signed-off-by: Lama Kayal Reviewed-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index 7e37d6e9eb83..7b5071c3df36 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -124,6 +124,7 @@ static int hws_pool_buddy_init(struct mlx5hws_pool *pool) mlx5hws_err(pool->ctx, "Failed to create resource type: %d size %zu\n", pool->type, pool->alloc_log_sz); mlx5hws_buddy_cleanup(buddy); + kfree(buddy); return -ENOMEM; } -- cgit v1.2.3 From a630f83592cdad1253523a1b760cfe78fef6cd9c Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 25 Aug 2025 17:34:25 +0300 Subject: net/mlx5: HWS, Fix memory leak in hws_action_get_shared_stc_nic error flow When an invalid stc_type is provided, the function allocates memory for shared_stc but jumps to unlock_and_out without freeing it, causing a memory leak. Fix by jumping to free_shared_stc label instead to ensure proper cleanup. 
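The underlying idiom, as a generic self-contained sketch (names are placeholders, not mlx5hws ones): each error label unwinds exactly what was acquired before the failing step, so a failure after an allocation must jump to the label that frees that allocation.

#include <linux/slab.h>

struct example_ctx {
	void *obj;
};

static int example_get(struct example_ctx *ctx, bool valid_input)
{
	void *obj;

	obj = kzalloc(32, GFP_KERNEL);
	if (!obj)
		return -ENOMEM;

	if (!valid_input)
		goto free_obj;	/* jumping past this label would leak 'obj' */

	ctx->obj = obj;
	return 0;

free_obj:
	kfree(obj);
	return -EINVAL;
}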
Fixes: 504e536d9010 ("net/mlx5: HWS, added actions handling") Signed-off-by: Lama Kayal Reviewed-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index 396804369b00..6b36a4a7d895 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -117,7 +117,7 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx, mlx5hws_err(ctx, "No such stc_type: %d\n", stc_type); pr_warn("HWS: Invalid stc_type: %d\n", stc_type); ret = -EINVAL; - goto unlock_and_out; + goto free_shared_stc; } ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, tbl_type, -- cgit v1.2.3 From 24b6e53140475b56cadcccd4e82a93aa5bacf1eb Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 25 Aug 2025 17:34:26 +0300 Subject: net/mlx5: HWS, Fix uninitialized variables in mlx5hws_pat_calc_nop error flow In mlx5hws_pat_calc_nop(), src_field and dst_field are passed to hws_action_modify_get_target_fields() which should set their values. However, if an invalid action type is encountered, these variables remain uninitialized and are later used to update prev_src_field and prev_dst_field. Initialize both variables to INVALID_FIELD to ensure they have defined values in all code paths. Fixes: 01e035fd0380 ("net/mlx5: HWS, handle modify header actions dependency") Signed-off-by: Lama Kayal Reviewed-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c index 51e4c551e0ef..622fd579f140 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c @@ -527,7 +527,6 @@ int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions, u32 *nop_locations, __be64 *new_pat) { u16 prev_src_field = INVALID_FIELD, prev_dst_field = INVALID_FIELD; - u16 src_field, dst_field; u8 action_type; bool dependent; size_t i, j; @@ -539,6 +538,9 @@ int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions, return 0; for (i = 0, j = 0; i < num_actions; i++, j++) { + u16 src_field = INVALID_FIELD; + u16 dst_field = INVALID_FIELD; + if (j >= max_actions) return -EINVAL; -- cgit v1.2.3 From 00a50e4e8974cbf5d6a1dc91cfa5cce4aa7af05a Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 25 Aug 2025 17:34:27 +0300 Subject: net/mlx5: HWS, Fix pattern destruction in mlx5hws_pat_get_pattern error path In mlx5hws_pat_get_pattern(), when mlx5hws_pat_add_pattern_to_cache() fails, the function attempts to clean up the pattern created by mlx5hws_cmd_header_modify_pattern_create(). However, it incorrectly uses *pattern_id which hasn't been set yet, instead of the local ptrn_id variable that contains the actual pattern ID. This results in attempting to destroy a pattern using uninitialized data from the output parameter, rather than the valid pattern ID returned by the firmware. 
Use ptrn_id instead of *pattern_id in the cleanup path to properly destroy the created pattern. Fixes: aefc15a0fa1c ("net/mlx5: HWS, added modify header pattern and args handling") Signed-off-by: Lama Kayal Reviewed-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c index 622fd579f140..d56271a9e4f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c @@ -279,7 +279,7 @@ int mlx5hws_pat_get_pattern(struct mlx5hws_context *ctx, return ret; clean_pattern: - mlx5hws_cmd_header_modify_pattern_destroy(ctx->mdev, *pattern_id); + mlx5hws_cmd_header_modify_pattern_destroy(ctx->mdev, ptrn_id); out_unlock: mutex_unlock(&ctx->pattern_cache->lock); return ret; -- cgit v1.2.3 From 34cc6a54914f478c93e176450fae6313404f9f74 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 25 Aug 2025 17:34:28 +0300 Subject: net/mlx5: Reload auxiliary drivers on fw_activate The devlink reload fw_activate command performs firmware activation followed by driver reload, while devlink reload driver_reinit triggers only driver reload. However, the driver reload logic differs between the two modes, as on driver_reinit mode mlx5 also reloads auxiliary drivers, while in fw_activate mode the auxiliary drivers are suspended where applicable. Additionally, following the cited commit, if the device has multiple PFs, the behavior during fw_activate may vary between PFs: one PF may suspend auxiliary drivers, while another reloads them. Align devlink dev reload fw_activate behavior with devlink dev reload driver_reinit, to reload all auxiliary drivers. Fixes: 72ed5d5624af ("net/mlx5: Suspend auxiliary devices only in case of PCI device suspend") Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Reviewed-by: Akiva Goldberger Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 3ffa3fbacd16..26091e7536d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -160,7 +160,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli if (err) return err; - mlx5_unload_one_devl_locked(dev, true); + mlx5_unload_one_devl_locked(dev, false); err = mlx5_health_wait_pci_up(dev); if (err) NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset"); -- cgit v1.2.3 From 902a8bc23a24882200f57cadc270e15a2cfaf2bb Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 25 Aug 2025 17:34:29 +0300 Subject: net/mlx5: Fix lockdep assertion on sync reset unload event Fix lockdep assertion triggered during sync reset unload event. When the sync reset flow is initiated using the devlink reload fw_activate option, the PF already holds the devlink lock while handling unload event. 
In this case, delegate sync reset unload event handling back to the devlink callback process to avoid double-locking and resolve the lockdep warning. Kernel log: WARNING: CPU: 9 PID: 1578 at devl_assert_locked+0x31/0x40 [...] Call Trace: mlx5_unload_one_devl_locked+0x2c/0xc0 [mlx5_core] mlx5_sync_reset_unload_event+0xaf/0x2f0 [mlx5_core] process_one_work+0x222/0x640 worker_thread+0x199/0x350 kthread+0x10b/0x230 ? __pfx_worker_thread+0x10/0x10 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x8e/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Fixes: 7a9770f1bfea ("net/mlx5: Handle sync reset unload event") Signed-off-by: Moshe Shemesh Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-7-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 120 ++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h | 1 + 3 files changed, 69 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 26091e7536d3..2c0e0c16ca90 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -160,7 +160,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli if (err) return err; - mlx5_unload_one_devl_locked(dev, false); + mlx5_sync_reset_unload_flow(dev, true); err = mlx5_health_wait_pci_up(dev); if (err) NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 69933addd921..38b9b184ae01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -12,7 +12,8 @@ enum { MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, MLX5_FW_RESET_FLAGS_PENDING_COMP, MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, - MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED + MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, + MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, }; struct mlx5_fw_reset { @@ -219,7 +220,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false); } -static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded) +static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev) { struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; struct devlink *devlink = priv_to_devlink(dev); @@ -228,8 +229,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) { complete(&fw_reset->done); } else { - if (!unloaded) - mlx5_unload_one(dev, false); + mlx5_sync_reset_unload_flow(dev, false); if (mlx5_health_wait_pci_up(dev)) mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); else @@ -272,7 +272,7 @@ static void mlx5_sync_reset_reload_work(struct work_struct *work) mlx5_sync_reset_clear_reset_requested(dev, false); mlx5_enter_error_state(dev, true); - mlx5_fw_reset_complete_reload(dev, false); + mlx5_fw_reset_complete_reload(dev); } #define MLX5_RESET_POLL_INTERVAL (HZ / 10) @@ -586,6 +586,65 @@ static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method) return err; } +void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + unsigned 
long timeout; + int poll_freq = 20; + bool reset_action; + u8 rst_state; + int err; + + if (locked) + mlx5_unload_one_devl_locked(dev, false); + else + mlx5_unload_one(dev, false); + + if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags)) + return; + + mlx5_set_fw_rst_ack(dev); + mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n"); + + reset_action = false; + timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD)); + do { + rst_state = mlx5_get_fw_rst_state(dev); + if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ || + rst_state == MLX5_FW_RST_STATE_IDLE) { + reset_action = true; + break; + } + if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) { + mlx5_core_info(dev, "Sync Reset Drop mode ack\n"); + mlx5_set_fw_rst_ack(dev); + poll_freq = 1000; + } + msleep(poll_freq); + } while (!time_after(jiffies, timeout)); + + if (!reset_action) { + mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n", + rst_state); + fw_reset->ret = -ETIMEDOUT; + goto done; + } + + mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", + rst_state); + if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) { + err = mlx5_sync_pci_reset(dev, fw_reset->reset_method); + if (err) { + mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", + err); + fw_reset->ret = err; + } + } + +done: + clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags); +} + static void mlx5_sync_reset_now_event(struct work_struct *work) { struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, @@ -613,17 +672,13 @@ static void mlx5_sync_reset_now_event(struct work_struct *work) mlx5_enter_error_state(dev, true); done: fw_reset->ret = err; - mlx5_fw_reset_complete_reload(dev, false); + mlx5_fw_reset_complete_reload(dev); } static void mlx5_sync_reset_unload_event(struct work_struct *work) { struct mlx5_fw_reset *fw_reset; struct mlx5_core_dev *dev; - unsigned long timeout; - int poll_freq = 20; - bool reset_action; - u8 rst_state; int err; fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work); @@ -632,6 +687,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work) if (mlx5_sync_reset_clear_reset_requested(dev, false)) return; + set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags); mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n"); err = mlx5_cmd_fast_teardown_hca(dev); @@ -640,49 +696,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work) else mlx5_enter_error_state(dev, true); - if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) - mlx5_unload_one_devl_locked(dev, false); - else - mlx5_unload_one(dev, false); - - mlx5_set_fw_rst_ack(dev); - mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n"); - - reset_action = false; - timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD)); - do { - rst_state = mlx5_get_fw_rst_state(dev); - if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ || - rst_state == MLX5_FW_RST_STATE_IDLE) { - reset_action = true; - break; - } - if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) { - mlx5_core_info(dev, "Sync Reset Drop mode ack\n"); - mlx5_set_fw_rst_ack(dev); - poll_freq = 1000; - } - msleep(poll_freq); - } while (!time_after(jiffies, timeout)); - - if (!reset_action) { - mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n", - rst_state); - fw_reset->ret = -ETIMEDOUT; - goto done; - } - - mlx5_core_warn(dev, "Sync Reset, got reset action. 
rst_state = %u\n", rst_state); - if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) { - err = mlx5_sync_pci_reset(dev, fw_reset->reset_method); - if (err) { - mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", err); - fw_reset->ret = err; - } - } - -done: - mlx5_fw_reset_complete_reload(dev, true); + mlx5_fw_reset_complete_reload(dev); } static void mlx5_sync_reset_abort_event(struct work_struct *work) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h index ea527d06a85f..d5b28525c960 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h @@ -12,6 +12,7 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel, int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev); int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev); +void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked); int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev, struct netlink_ext_ack *extack); void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 26e42ec7712d392d561964514b1f253b1a96f42d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 25 Aug 2025 17:34:30 +0300 Subject: net/mlx5: Nack sync reset when SFs are present If PF (Physical Function) has SFs (Sub-Functions), since the SFs are not taking part in the synchronization flow, sync reset can lead to fatal error on the SFs, as the function will be closed unexpectedly from the SF point of view. Add a check to prevent sync reset when there are SFs on a PF device which is not ECPF, as ECPF is teardowned gracefully before reset. Fixes: 92501fa6e421 ("net/mlx5: Ack on sync_reset_request only if PF can do reset_now") Signed-off-by: Moshe Shemesh Reviewed-by: Parav Pandit Reviewed-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-8-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c | 10 ++++++++++ drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 38b9b184ae01..22995131824a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -6,6 +6,7 @@ #include "fw_reset.h" #include "diag/fw_tracer.h" #include "lib/tout.h" +#include "sf/sf.h" enum { MLX5_FW_RESET_FLAGS_RESET_REQUESTED, @@ -428,6 +429,11 @@ static bool mlx5_is_reset_now_capable(struct mlx5_core_dev *dev, return false; } + if (!mlx5_core_is_ecpf(dev) && !mlx5_sf_table_empty(dev)) { + mlx5_core_warn(dev, "SFs should be removed before reset\n"); + return false; + } + #if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE) if (reset_method != MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET) { err = mlx5_check_hotplug_interrupt(dev, bridge); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c index 0864ba625c07..3304f25cc805 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c @@ -518,3 +518,13 @@ void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) WARN_ON(!xa_empty(&table->function_ids)); kfree(table); } + +bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table 
*table = dev->priv.sf_table; + + if (!table) + return true; + + return xa_empty(&table->function_ids); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h index 860f9ddb7107..89559a37997a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h @@ -17,6 +17,7 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev); int mlx5_sf_table_init(struct mlx5_core_dev *dev); void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev); +bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev); int mlx5_devlink_sf_port_new(struct devlink *devlink, const struct devlink_port_new_attrs *add_attr, @@ -61,6 +62,11 @@ static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) { } +static inline bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev) +{ + return true; +} + #endif #endif -- cgit v1.2.3 From cf9a8627b9a369ba01d37be6f71b297beb688faa Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 25 Aug 2025 17:34:31 +0300 Subject: net/mlx5: Prevent flow steering mode changes in switchdev mode Changing flow steering modes is not allowed when eswitch is in switchdev mode. This fix ensures that any steering mode change, including to firmware steering, is correctly blocked while eswitch mode is switchdev. Fixes: e890acd5ff18 ("net/mlx5: Add devlink flow_steering_mode parameter") Signed-off-by: Moshe Shemesh Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-9-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index d87392360dbd..cb165085a4c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -3734,6 +3734,13 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id, char *value = val.vstr; u8 eswitch_mode; + eswitch_mode = mlx5_eswitch_mode(dev); + if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Changing fs mode is not supported when eswitch offloads enabled."); + return -EOPNOTSUPP; + } + if (!strcmp(value, "dmfs")) return 0; @@ -3759,14 +3766,6 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id, return -EINVAL; } - eswitch_mode = mlx5_eswitch_mode(dev); - if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { - NL_SET_ERR_MSG_FMT_MOD(extack, - "Moving to %s is not supported when eswitch offloads enabled.", - value); - return -EOPNOTSUPP; - } - return 0; } -- cgit v1.2.3 From ceddedc969f0532b7c62ca971ee50d519d2bc0cb Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Mon, 25 Aug 2025 17:34:32 +0300 Subject: net/mlx5e: Update and set Xon/Xoff upon MTU set Xon/Xoff sizes are derived from calculation that include the MTU size. Set Xon/Xoff when MTU is set. If Xon/Xoff fails, set the previous MTU. 
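In short, the change below remembers the MTU that was previously programmed, applies the new one, re-derives the Xon/Xoff thresholds from it, and falls back to the old MTU if that re-derivation fails. A minimal sketch of that flow, using the mlx5e helpers visible in the diff below (the wrapper function name here is illustrative, not part of the driver):

/* Sketch only: apply a new MTU and keep the port buffer (Xon/Xoff)
 * configuration consistent with it, rolling back on failure.
 */
static int set_mtu_and_update_xonxoff(struct mlx5e_priv *priv)
{
	struct mlx5e_params *params = &priv->channels.params;
	struct mlx5_core_dev *mdev = priv->mdev;
	u16 mtu, prev_mtu;
	int err;

	mlx5e_query_mtu(mdev, params, &prev_mtu);	/* remember old value */

	err = mlx5e_set_mtu(mdev, params, params->sw_mtu);
	if (err)
		return err;

	mlx5e_query_mtu(mdev, params, &mtu);		/* what HW accepted */

	if (mtu != prev_mtu && MLX5_BUFFER_SUPPORTED(mdev)) {
		/* Xon/Xoff are derived from the MTU: recompute them */
		err = mlx5e_port_manual_buffer_config(priv, 0, mtu,
						      NULL, NULL, NULL);
		if (err) {
			/* keep the device consistent: restore the old MTU */
			mlx5e_set_mtu(mdev, params, prev_mtu);
			return err;
		}
	}

	params->sw_mtu = mtu;
	return 0;
}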
Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") Signed-off-by: Alexei Lazar Reviewed-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-10-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/port_buffer.h | 12 ++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 17 ++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h index f4a19ffbb641..66d276a1be83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h @@ -66,11 +66,23 @@ struct mlx5e_port_buffer { struct mlx5e_bufferx_reg buffer[MLX5E_MAX_NETWORK_BUFFER]; }; +#ifdef CONFIG_MLX5_CORE_EN_DCB int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, u32 change, unsigned int mtu, struct ieee_pfc *pfc, u32 *buffer_size, u8 *prio2buffer); +#else +static inline int +mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, + u32 change, unsigned int mtu, + void *pfc, + u32 *buffer_size, + u8 *prio2buffer) +{ + return 0; +} +#endif int mlx5e_port_query_buffer(struct mlx5e_priv *priv, struct mlx5e_port_buffer *port_buffer); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 21bb88c5d3dc..15eded36b872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -49,6 +49,7 @@ #include "en.h" #include "en/dim.h" #include "en/txrx.h" +#include "en/port_buffer.h" #include "en_tc.h" #include "en_rep.h" #include "en_accel/ipsec.h" @@ -3040,9 +3041,11 @@ int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv) struct mlx5e_params *params = &priv->channels.params; struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; - u16 mtu; + u16 mtu, prev_mtu; int err; + mlx5e_query_mtu(mdev, params, &prev_mtu); + err = mlx5e_set_mtu(mdev, params, params->sw_mtu); if (err) return err; @@ -3052,6 +3055,18 @@ int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv) netdev_warn(netdev, "%s: VPort MTU %d is different than netdev mtu %d\n", __func__, mtu, params->sw_mtu); + if (mtu != prev_mtu && MLX5_BUFFER_SUPPORTED(mdev)) { + err = mlx5e_port_manual_buffer_config(priv, 0, mtu, + NULL, NULL, NULL); + if (err) { + netdev_warn(netdev, "%s: Failed to set Xon/Xoff values with MTU %d (err %d), setting back to previous MTU %d\n", + __func__, mtu, err, prev_mtu); + + mlx5e_set_mtu(mdev, params, prev_mtu); + return err; + } + } + params->sw_mtu = mtu; return 0; } -- cgit v1.2.3 From d24341740fe48add8a227a753e68b6eedf4b385a Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Mon, 25 Aug 2025 17:34:33 +0300 Subject: net/mlx5e: Update and set Xon/Xoff upon port speed set Xon/Xoff sizes are derived from calculations that include the port speed. These settings need to be updated and applied whenever the port speed is changed. The port speed is typically set after the physical link goes down and is negotiated as part of the link-up process between the two connected interfaces. The Xon/Xoff parameters are therefore updated at the point where the new negotiated speed is established. 
Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") Signed-off-by: Alexei Lazar Reviewed-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-11-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 15eded36b872..e680673ffb72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -139,6 +139,8 @@ void mlx5e_update_carrier(struct mlx5e_priv *priv) if (up) { netdev_info(priv->netdev, "Link up\n"); netif_carrier_on(priv->netdev); + mlx5e_port_manual_buffer_config(priv, 0, priv->netdev->mtu, + NULL, NULL, NULL); } else { netdev_info(priv->netdev, "Link down\n"); netif_carrier_off(priv->netdev); -- cgit v1.2.3 From aca0c31af61e0d5cf1675a0cbd29460b95ae693c Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Mon, 25 Aug 2025 17:34:34 +0300 Subject: net/mlx5e: Set local Xoff after FW update The local Xoff value is being set before the firmware (FW) update. In case of a failure where the FW is not updated with the new value, there is no fallback to the previous value. Update the local Xoff value after the FW has been successfully set. Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") Signed-off-by: Alexei Lazar Reviewed-by: Tariq Toukan Reviewed-by: Dragos Tatulea Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250825143435.598584-12-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 3efa8bf1d14e..4720523813b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -575,7 +575,6 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (err) return err; } - priv->dcbx.xoff = xoff; /* Apply the settings */ if (update_buffer) { @@ -584,6 +583,8 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, return err; } + priv->dcbx.xoff = xoff; + if (update_prio2buffer) err = mlx5e_port_set_priority2buffer(priv->mdev, prio2buffer); -- cgit v1.2.3 From 4f23382841e67174211271a454811dd17c0ef3c5 Mon Sep 17 00:00:00 2001 From: Rohan G Thomas Date: Mon, 25 Aug 2025 12:36:52 +0800 Subject: net: stmmac: xgmac: Do not enable RX FIFO Overflow interrupts Enabling RX FIFO Overflow interrupts is counterproductive and causes an interrupt storm when RX FIFO overflows. Disabling this interrupt has no side effect and eliminates interrupt storms when the RX FIFO overflows. Commit 8a7cb245cf28 ("net: stmmac: Do not enable RX FIFO overflow interrupts") disables RX FIFO overflow interrupts for DWMAC4 IP and removes the corresponding handling of this interrupt. This patch is doing the same thing for XGMAC IP. 
Fixes: 2142754f8b9c ("net: stmmac: Add MAC related callbacks for XGMAC2") Signed-off-by: Rohan G Thomas Reviewed-by: Matthew Gerlach Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250825-xgmac-minor-fixes-v3-1-c225fe4444c0@altera.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 5dcc95bc0ad2..7201a3884265 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -203,10 +203,6 @@ static void dwxgmac2_dma_rx_mode(struct stmmac_priv *priv, void __iomem *ioaddr, } writel(value, ioaddr + XGMAC_MTL_RXQ_OPMODE(channel)); - - /* Enable MTL RX overflow */ - value = readl(ioaddr + XGMAC_MTL_QINTEN(channel)); - writel(value | XGMAC_RXOIE, ioaddr + XGMAC_MTL_QINTEN(channel)); } static void dwxgmac2_dma_tx_mode(struct stmmac_priv *priv, void __iomem *ioaddr, -- cgit v1.2.3 From 42ef11b2bff5b6a2910c28d2ea47cc00e0fbcaec Mon Sep 17 00:00:00 2001 From: Rohan G Thomas Date: Mon, 25 Aug 2025 12:36:53 +0800 Subject: net: stmmac: xgmac: Correct supported speed modes Correct supported speed modes as per the XGMAC databook. Commit 9cb54af214a7 ("net: stmmac: Fix IP-cores specific MAC capabilities") removes support for 10M, 100M and 1000HD. 1000HD is not supported by XGMAC IP, but it does support 10M and 100M FD mode for XGMAC version >= 2_20, and it also supports 10M and 100M HD mode if the HDSEL bit is set in the MAC_HW_FEATURE0 reg. This commit enables support for 10M and 100M speed modes for XGMAC IP based on XGMAC version and MAC capabilities. Fixes: 9cb54af214a7 ("net: stmmac: Fix IP-cores specific MAC capabilities") Signed-off-by: Rohan G Thomas Reviewed-by: Matthew Gerlach Link: https://patch.msgid.link/20250825-xgmac-minor-fixes-v3-2-c225fe4444c0@altera.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 13 +++++++++++-- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 5 +++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 6cadf8de4fdf..00e929bf280b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -49,6 +49,14 @@ static void dwxgmac2_core_init(struct mac_device_info *hw, writel(XGMAC_INT_DEFAULT_EN, ioaddr + XGMAC_INT_EN); } +static void dwxgmac2_update_caps(struct stmmac_priv *priv) +{ + if (!priv->dma_cap.mbps_10_100) + priv->hw->link.caps &= ~(MAC_10 | MAC_100); + else if (!priv->dma_cap.half_duplex) + priv->hw->link.caps &= ~(MAC_10HD | MAC_100HD); +} + static void dwxgmac2_set_mac(void __iomem *ioaddr, bool enable) { u32 tx = readl(ioaddr + XGMAC_TX_CONFIG); @@ -1424,6 +1432,7 @@ static void dwxgmac2_set_arp_offload(struct mac_device_info *hw, bool en, const struct stmmac_ops dwxgmac210_ops = { .core_init = dwxgmac2_core_init, + .update_caps = dwxgmac2_update_caps, .set_mac = dwxgmac2_set_mac, .rx_ipc = dwxgmac2_rx_ipc, .rx_queue_enable = dwxgmac2_rx_queue_enable, @@ -1532,8 +1541,8 @@ int dwxgmac2_setup(struct stmmac_priv *priv) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | - MAC_1000FD | MAC_2500FD | MAC_5000FD | - MAC_10000FD; + MAC_10 | MAC_100 | MAC_1000FD | + MAC_2500FD | MAC_5000FD | MAC_10000FD; mac->link.duplex 
= 0; mac->link.speed10 = XGMAC_CONFIG_SS_10_MII; mac->link.speed100 = XGMAC_CONFIG_SS_100_MII; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c index 7201a3884265..4d6bb995d8d8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c @@ -382,8 +382,11 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv, static int dwxgmac2_get_hw_feature(void __iomem *ioaddr, struct dma_features *dma_cap) { + struct stmmac_priv *priv; u32 hw_cap; + priv = container_of(dma_cap, struct stmmac_priv, dma_cap); + /* MAC HW feature 0 */ hw_cap = readl(ioaddr + XGMAC_HW_FEATURE0); dma_cap->edma = (hw_cap & XGMAC_HWFEAT_EDMA) >> 31; @@ -406,6 +409,8 @@ static int dwxgmac2_get_hw_feature(void __iomem *ioaddr, dma_cap->vlhash = (hw_cap & XGMAC_HWFEAT_VLHASH) >> 4; dma_cap->half_duplex = (hw_cap & XGMAC_HWFEAT_HDSEL) >> 3; dma_cap->mbps_1000 = (hw_cap & XGMAC_HWFEAT_GMIISEL) >> 1; + if (dma_cap->mbps_1000 && priv->synopsys_id >= DWXGMAC_CORE_2_20) + dma_cap->mbps_10_100 = 1; /* MAC HW feature 1 */ hw_cap = readl(ioaddr + XGMAC_HW_FEATURE1); -- cgit v1.2.3 From b1eded580ab28119de0b0f21efe37ee2b4419144 Mon Sep 17 00:00:00 2001 From: Rohan G Thomas Date: Mon, 25 Aug 2025 12:36:54 +0800 Subject: net: stmmac: Set CIC bit only for TX queues with COE Currently, in the AF_XDP transmit paths, the CIC bit of TX Desc3 is set for all packets. Setting this bit for packets transmitting through queues that don't support checksum offloading causes the TX DMA to get stuck after transmitting some packets. This patch ensures the CIC bit of TX Desc3 is set only if the TX queue supports checksum offloading. Fixes: 132c32ee5bc0 ("net: stmmac: Add TX via XDP zero-copy socket") Signed-off-by: Rohan G Thomas Reviewed-by: Matthew Gerlach Link: https://patch.msgid.link/20250825-xgmac-minor-fixes-v3-3-c225fe4444c0@altera.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f1abf4242cd2..7b16d1207b80 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2584,6 +2584,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) struct netdev_queue *nq = netdev_get_tx_queue(priv->dev, queue); struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue]; struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[queue]; + bool csum = !priv->plat->tx_queues_cfg[queue].coe_unsupported; struct xsk_buff_pool *pool = tx_q->xsk_pool; unsigned int entry = tx_q->cur_tx; struct dma_desc *tx_desc = NULL; @@ -2671,7 +2672,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) } stmmac_prepare_tx_desc(priv, tx_desc, 1, xdp_desc.len, - true, priv->mode, true, true, + csum, priv->mode, true, true, xdp_desc.len); stmmac_enable_dma_transmission(priv, priv->ioaddr, queue); @@ -4983,6 +4984,7 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue, { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[queue]; struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue]; + bool csum = !priv->plat->tx_queues_cfg[queue].coe_unsupported; unsigned int entry = tx_q->cur_tx; struct dma_desc *tx_desc; dma_addr_t dma_addr; @@ -5034,7 +5036,7 @@ static int 
stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue, stmmac_set_desc_addr(priv, tx_desc, dma_addr); stmmac_prepare_tx_desc(priv, tx_desc, 1, xdpf->len, - true, priv->mode, true, true, + csum, priv->mode, true, true, xdpf->len); tx_q->tx_count_frames++; -- cgit v1.2.3 From 9448ccd853368582efa9db05db344f8bb9dffe0f Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Mon, 25 Aug 2025 04:56:27 -0700 Subject: net: hv_netvsc: fix loss of early receive events from host during channel open. The hv_netvsc driver currently enables NAPI after opening the primary and subchannels. This ordering creates a race: if the Hyper-V host places data in the host -> guest ring buffer and signals the channel before napi_enable() has been called, the channel callback will run but napi_schedule_prep() will return false. As a result, the NAPI poller never gets scheduled, the data in the ring buffer is not consumed, and the receive queue may remain permanently stuck until another interrupt happens to arrive. Fix this by enabling NAPI and registering it with the RX/TX queues before vmbus channel is opened. This guarantees that any early host signal after open will correctly trigger NAPI scheduling and the ring buffer will be drained. Fixes: 76bb5db5c749d ("netvsc: fix use after free on module removal") Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20250825115627.GA32189@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Jakub Kicinski --- drivers/net/hyperv/netvsc.c | 17 ++++++++--------- drivers/net/hyperv/rndis_filter.c | 23 ++++++++++++++++------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 720104661d7f..60a4629fe6ba 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1812,6 +1812,11 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Enable NAPI handler before init callbacks */ netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll); + napi_enable(&net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, + &net_device->chan_table[0].napi); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, + &net_device->chan_table[0].napi); /* Open the channel */ device->channel->next_request_id_callback = vmbus_next_request_id; @@ -1831,12 +1836,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, /* Channel is opened */ netdev_dbg(ndev, "hv_netvsc channel opened successfully\n"); - napi_enable(&net_device->chan_table[0].napi); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, - &net_device->chan_table[0].napi); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, - &net_device->chan_table[0].napi); - /* Connect with the NetVsp */ ret = netvsc_connect_vsp(device, net_device, device_info); if (ret != 0) { @@ -1854,14 +1853,14 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, close: RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL); - netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL); - napi_disable(&net_device->chan_table[0].napi); /* Now, we can close the channel safely */ vmbus_close(device->channel); cleanup: + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL); + netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL); + napi_disable(&net_device->chan_table[0].napi); netif_napi_del(&net_device->chan_table[0].napi); cleanup2: diff --git a/drivers/net/hyperv/rndis_filter.c 
b/drivers/net/hyperv/rndis_filter.c index 9e73959e61ee..c35f9685b6bf 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1252,17 +1252,26 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE; + /* Enable napi before opening the vmbus channel to avoid races + * as the host placing data on the host->guest ring may be left + * out if napi was not enabled. + */ + napi_enable(&nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + &nvchan->napi); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + &nvchan->napi); + ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); - if (ret == 0) { - napi_enable(&nvchan->napi); - netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, - &nvchan->napi); - netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, - &nvchan->napi); - } else { + if (ret != 0) { netdev_notice(ndev, "sub channel open failed: %d\n", ret); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX, + NULL); + netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX, + NULL); + napi_disable(&nvchan->napi); } if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn) -- cgit v1.2.3 From dcb34659028f856c423a29ef9b4e2571d203444d Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:55 +0900 Subject: net: rose: split remove and free operations in rose_remove_neigh() The current rose_remove_neigh() performs two distinct operations: 1. Removes rose_neigh from rose_neigh_list 2. Frees the rose_neigh structure Split these operations into separate functions to improve maintainability and prepare for the upcoming refcount_t conversion. The timer cleanup remains in rose_remove_neigh() because the free operations can be called from the timer itself. This patch introduces rose_neigh_put() to handle the freeing of rose_neigh structures and modifies rose_remove_neigh() to handle removal only. 
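The resulting split can be pictured as follows; a minimal sketch based on the hunks below (locking is omitted and the caller function is illustrative only, not part of the driver):

/* rose_remove_neigh() now only unlinks the neighbour from rose_neigh_list
 * (and stops its timers); the actual freeing moves into rose_neigh_put(),
 * which the follow-up patch turns into a real refcounted put.
 */
static inline void rose_neigh_put(struct rose_neigh *rose_neigh)
{
	if (rose_neigh->ax25)
		ax25_cb_put(rose_neigh->ax25);
	kfree(rose_neigh->digipeat);
	kfree(rose_neigh);
}

/* Illustrative caller: drop a neighbour that is no longer referenced. */
static void example_drop_neigh(struct rose_neigh *rose_neigh)
{
	rose_remove_neigh(rose_neigh);	/* unlink from rose_neigh_list */
	rose_neigh_put(rose_neigh);	/* free the structure */
}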
Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-2-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- include/net/rose.h | 8 ++++++++ net/rose/rose_route.c | 15 ++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/net/rose.h b/include/net/rose.h index 23267b4efcfa..174b4f605d84 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -151,6 +151,14 @@ struct rose_sock { #define rose_sk(sk) ((struct rose_sock *)(sk)) +static inline void rose_neigh_put(struct rose_neigh *rose_neigh) +{ + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); +} + /* af_rose.c */ extern ax25_address rose_callsign; extern int sysctl_rose_restart_request_timeout; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index b72bf8a08d48..0c44c416f485 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -234,20 +234,12 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } while (s != NULL && s->next != NULL) { if (s->next == rose_neigh) { s->next = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } @@ -331,8 +323,10 @@ static int rose_del_node(struct rose_route_struct *rose_route, if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; - if (rose_neigh->count == 0 && rose_neigh->use == 0) + if (rose_neigh->count == 0 && rose_neigh->use == 0) { rose_remove_neigh(rose_neigh); + rose_neigh_put(rose_neigh); + } rose_node->count--; @@ -513,6 +507,7 @@ void rose_rt_device_down(struct net_device *dev) } rose_remove_neigh(s); + rose_neigh_put(s); } spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); @@ -569,6 +564,7 @@ static int rose_clear_routes(void) if (s->use == 0 && !s->loopback) { s->count = 0; rose_remove_neigh(s); + rose_neigh_put(s); } } @@ -1301,6 +1297,7 @@ void __exit rose_rt_free(void) rose_neigh = rose_neigh->next; rose_remove_neigh(s); + rose_neigh_put(s); } while (rose_node != NULL) { -- cgit v1.2.3 From d860d1faa6b2ce3becfdb8b0c2b048ad31800061 Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:56 +0900 Subject: net: rose: convert 'use' field to refcount_t The 'use' field in struct rose_neigh is used as a reference counter but lacks atomicity. This can lead to race conditions where a rose_neigh structure is freed while still being referenced by other code paths. For example, when rose_neigh->use becomes zero during an ioctl operation via rose_rt_ioctl(), the structure may be removed while its timer is still active, potentially causing use-after-free issues. This patch changes the type of 'use' from unsigned short to refcount_t and updates all code paths to use rose_neigh_hold() and rose_neigh_put() which operate reference counts atomically. 
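The lifetime rules after this change follow the usual refcount_t pattern; a short sketch matching the helpers in the diff below (the creation and /proc notes are paraphrased from the same diff):

/* A neighbour starts with one reference owned by its creator/list entry;
 * each rose_sock user takes its own reference, and the memory is
 * released only on the final put.
 */
static inline void rose_neigh_hold(struct rose_neigh *rose_neigh)
{
	refcount_inc(&rose_neigh->use);
}

static inline void rose_neigh_put(struct rose_neigh *rose_neigh)
{
	if (refcount_dec_and_test(&rose_neigh->use)) {
		if (rose_neigh->ax25)
			ax25_cb_put(rose_neigh->ax25);
		kfree(rose_neigh->digipeat);
		kfree(rose_neigh);
	}
}

/* creation:   refcount_set(&rose_neigh->use, 1);
 * /proc view: printing refcount_read(&rose_neigh->use) - 1 keeps the old
 *             "use" semantics (number of socket users) in the seq_file output.
 */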
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-3-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- include/net/rose.h | 18 +++++++++++++----- net/rose/af_rose.c | 13 +++++++------ net/rose/rose_in.c | 12 ++++++------ net/rose/rose_route.c | 33 ++++++++++++++++++--------------- net/rose/rose_timer.c | 2 +- 5 files changed, 45 insertions(+), 33 deletions(-) diff --git a/include/net/rose.h b/include/net/rose.h index 174b4f605d84..2b5491bbf39a 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -8,6 +8,7 @@ #ifndef _ROSE_H #define _ROSE_H +#include #include #include #include @@ -96,7 +97,7 @@ struct rose_neigh { ax25_cb *ax25; struct net_device *dev; unsigned short count; - unsigned short use; + refcount_t use; unsigned int number; char restarted; char dce_mode; @@ -151,12 +152,19 @@ struct rose_sock { #define rose_sk(sk) ((struct rose_sock *)(sk)) +static inline void rose_neigh_hold(struct rose_neigh *rose_neigh) +{ + refcount_inc(&rose_neigh->use); +} + static inline void rose_neigh_put(struct rose_neigh *rose_neigh) { - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); + if (refcount_dec_and_test(&rose_neigh->use)) { + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + } } /* af_rose.c */ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 4e72b636a46a..543f9e8ebb69 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -170,7 +170,7 @@ void rose_kill_by_neigh(struct rose_neigh *neigh) if (rose->neighbour == neigh) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; } } @@ -212,7 +212,7 @@ start: if (rose->device == dev) { rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; } @@ -655,7 +655,7 @@ static int rose_release(struct socket *sock) break; case ROSE_STATE_2: - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); @@ -823,6 +823,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->lci = rose_new_lci(rose->neighbour); if (!rose->lci) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } @@ -834,12 +835,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le dev = rose_dev_first(); if (!dev) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } user = ax25_findbyuid(current_euid()); if (!user) { err = -EINVAL; + rose_neigh_put(rose->neighbour); dev_put(dev); goto out_release; } @@ -874,8 +877,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->state = ROSE_STATE_1; - rose->neighbour->use++; - rose_write_internal(sk, ROSE_CALL_REQUEST); rose_start_heartbeat(sk); rose_start_t1timer(sk); @@ -1077,7 +1078,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros GFP_ATOMIC); make_rose->facilities = facilities; - make_rose->neighbour->use++; + rose_neigh_hold(make_rose->neighbour); if (rose_sk(sk)->defer) { make_rose->state = ROSE_STATE_5; diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 3e99181e759f..0276b393f0e5 100644 --- a/net/rose/rose_in.c +++ 
b/net/rose/rose_in.c @@ -56,7 +56,7 @@ static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -79,12 +79,12 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_CLEAR_CONFIRMATION: rose_disconnect(sk, 0, -1, -1); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -121,7 +121,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_RR: @@ -234,7 +234,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -254,7 +254,7 @@ static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int framety if (frametype == ROSE_CLEAR_REQUEST) { rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose_sk(sk)->neighbour->use--; + rose_neigh_put(rose_sk(sk)->neighbour); } return 0; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 0c44c416f485..8efb9033c057 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -93,11 +93,11 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, rose_neigh->ax25 = NULL; rose_neigh->dev = dev; rose_neigh->count = 0; - rose_neigh->use = 0; rose_neigh->dce_mode = 0; rose_neigh->loopback = 0; rose_neigh->number = rose_neigh_no++; rose_neigh->restarted = 0; + refcount_set(&rose_neigh->use, 1); skb_queue_head_init(&rose_neigh->queue); @@ -255,10 +255,10 @@ static void rose_remove_route(struct rose_route *rose_route) struct rose_route *s; if (rose_route->neigh1 != NULL) - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); if (rose_route->neigh2 != NULL) - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); if ((s = rose_route_list) == rose_route) { rose_route_list = rose_route->next; @@ -323,7 +323,7 @@ static int rose_del_node(struct rose_route_struct *rose_route, if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; - if (rose_neigh->count == 0 && rose_neigh->use == 0) { + if (rose_neigh->count == 0) { rose_remove_neigh(rose_neigh); rose_neigh_put(rose_neigh); } @@ -375,11 +375,11 @@ void rose_add_loopback_neigh(void) sn->ax25 = NULL; sn->dev = NULL; sn->count = 0; - sn->use = 0; sn->dce_mode = 1; sn->loopback = 1; sn->number = rose_neigh_no++; sn->restarted = 1; + refcount_set(&sn->use, 1); skb_queue_head_init(&sn->queue); @@ -561,8 +561,7 @@ static int rose_clear_routes(void) s = rose_neigh; rose_neigh = rose_neigh->next; - if (s->use == 0 && !s->loopback) { - s->count = 0; + if (!s->loopback) { rose_remove_neigh(s); rose_neigh_put(s); } @@ -680,6 +679,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { 
if (node->neighbour[i]->restarted) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } } @@ -691,6 +691,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } failed = 1; @@ -780,13 +781,13 @@ static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) } if (rose_route->neigh1 == rose_neigh) { - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); rose_route->neigh1 = NULL; rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0); } if (rose_route->neigh2 == rose_neigh) { - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); rose_route->neigh2 = NULL; rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0); } @@ -915,7 +916,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_clear_queues(sk); rose->cause = ROSE_NETWORK_CONGESTION; rose->diagnostic = 0; - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; rose->lci = 0; rose->state = ROSE_STATE_0; @@ -1040,12 +1041,12 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) if ((new_lci = rose_new_lci(new_neigh)) == 0) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71); - goto out; + goto put_neigh; } if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); - goto out; + goto put_neigh; } rose_route->lci1 = lci; @@ -1058,8 +1059,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_route->lci2 = new_lci; rose_route->neigh2 = new_neigh; - rose_route->neigh1->use++; - rose_route->neigh2->use++; + rose_neigh_hold(rose_route->neigh1); + rose_neigh_hold(rose_route->neigh2); rose_route->next = rose_route_list; rose_route_list = rose_route; @@ -1071,6 +1072,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_transmit_link(skb, rose_route->neigh2); res = 1; +put_neigh: + rose_neigh_put(new_neigh); out: spin_unlock_bh(&rose_route_list_lock); spin_unlock_bh(&rose_neigh_list_lock); @@ -1186,7 +1189,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v) (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, - rose_neigh->use, + refcount_read(&rose_neigh->use) - 1, (rose_neigh->dce_mode) ? "DCE" : "DTE", (rose_neigh->restarted) ? "yes" : "no", ax25_display_timer(&rose_neigh->t0timer) / HZ, diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 020369c49587..bb60a1654d61 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -180,7 +180,7 @@ static void rose_timer_expiry(struct timer_list *t) break; case ROSE_STATE_2: /* T3 */ - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose_disconnect(sk, ETIMEDOUT, -1, -1); break; -- cgit v1.2.3 From da9c9c877597170b929a6121a68dcd3dd9a80f45 Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:57 +0900 Subject: net: rose: include node references in rose_neigh refcount Current implementation maintains two separate reference counting mechanisms: the 'count' field in struct rose_neigh tracks references from rose_node structures, while the 'use' field (now refcount_t) tracks references from rose_sock. 
This patch merges these two reference counting systems using 'use' field for proper reference management. Specifically, this patch adds incrementing and decrementing of rose_neigh->use when rose_neigh->count is incremented or decremented. This patch also modifies rose_rt_free(), rose_rt_device_down() and rose_clear_route() to properly release references to rose_neigh objects before freeing a rose_node through rose_remove_node(). These changes ensure rose_neigh structures are properly freed only when all references, including those from rose_node structures, are released. As a result, this resolves a slab-use-after-free issue reported by Syzbot. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+942297eecf7d2d61d1f1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=942297eecf7d2d61d1f1 Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-4-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/rose/rose_route.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 8efb9033c057..1adee1fbc2ed 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -178,6 +178,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, } } rose_neigh->count++; + rose_neigh_hold(rose_neigh); goto out; } @@ -187,6 +188,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, rose_node->neighbour[rose_node->count] = rose_neigh; rose_node->count++; rose_neigh->count++; + rose_neigh_hold(rose_neigh); } out: @@ -322,6 +324,7 @@ static int rose_del_node(struct rose_route_struct *rose_route, for (i = 0; i < rose_node->count; i++) { if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; + rose_neigh_put(rose_neigh); if (rose_neigh->count == 0) { rose_remove_neigh(rose_neigh); @@ -430,6 +433,7 @@ int rose_add_loopback_node(const rose_address *address) rose_node_list = rose_node; rose_loopback_neigh->count++; + rose_neigh_hold(rose_loopback_neigh); out: spin_unlock_bh(&rose_node_list_lock); @@ -461,6 +465,7 @@ void rose_del_loopback_node(const rose_address *address) rose_remove_node(rose_node); rose_loopback_neigh->count--; + rose_neigh_put(rose_loopback_neigh); out: spin_unlock_bh(&rose_node_list_lock); @@ -500,6 +505,7 @@ void rose_rt_device_down(struct net_device *dev) memmove(&t->neighbour[i], &t->neighbour[i + 1], sizeof(t->neighbour[0]) * (t->count - i)); + rose_neigh_put(s); } if (t->count <= 0) @@ -543,6 +549,7 @@ static int rose_clear_routes(void) { struct rose_neigh *s, *rose_neigh; struct rose_node *t, *rose_node; + int i; spin_lock_bh(&rose_node_list_lock); spin_lock_bh(&rose_neigh_list_lock); @@ -553,8 +560,12 @@ static int rose_clear_routes(void) while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; - if (!t->loopback) + + if (!t->loopback) { + for (i = 0; i < rose_node->count; i++) + rose_neigh_put(t->neighbour[i]); rose_remove_node(t); + } } while (rose_neigh != NULL) { @@ -1189,7 +1200,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v) (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, - refcount_read(&rose_neigh->use) - 1, + refcount_read(&rose_neigh->use) - rose_neigh->count - 1, (rose_neigh->dce_mode) ? "DCE" : "DTE", (rose_neigh->restarted) ? 
"yes" : "no", ax25_display_timer(&rose_neigh->t0timer) / HZ, @@ -1294,6 +1305,7 @@ void __exit rose_rt_free(void) struct rose_neigh *s, *rose_neigh = rose_neigh_list; struct rose_node *t, *rose_node = rose_node_list; struct rose_route *u, *rose_route = rose_route_list; + int i; while (rose_neigh != NULL) { s = rose_neigh; @@ -1307,6 +1319,8 @@ void __exit rose_rt_free(void) t = rose_node; rose_node = rose_node->next; + for (i = 0; i < t->count; i++) + rose_neigh_put(t->neighbour[i]); rose_remove_node(t); } -- cgit v1.2.3 From bcd6f8954dc4a3aa32edda5602e43a0174dc8f0f Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 26 Aug 2025 14:50:46 -0700 Subject: MAINTAINERS: rmnet: Update email addresses Switch to oss.qualcomm.com ids. Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Link: https://patch.msgid.link/20250826215046.865530-1-subash.a.kasiviswanathan@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c5b47955d2a6..3337577ce545 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20847,8 +20847,8 @@ S: Maintained F: drivers/firmware/qcom/qcom_qseecom_uefisecapp.c QUALCOMM RMNET DRIVER -M: Subash Abhinov Kasiviswanathan -M: Sean Tranchetti +M: Subash Abhinov Kasiviswanathan +M: Sean Tranchetti L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst -- cgit v1.2.3 From 2e8750469242cad8f01f320131fd5a6f540dbb99 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 14:13:14 +0000 Subject: sctp: initialize more fields in sctp_v6_from_sk() syzbot found that sin6_scope_id was not properly initialized, leading to undefined behavior. Clear sin6_scope_id and sin6_flowinfo. 
BUG: KMSAN: uninit-value in __sctp_v6_cmp_addr+0x887/0x8c0 net/sctp/ipv6.c:649 __sctp_v6_cmp_addr+0x887/0x8c0 net/sctp/ipv6.c:649 sctp_inet6_cmp_addr+0x4f2/0x510 net/sctp/ipv6.c:983 sctp_bind_addr_conflict+0x22a/0x3b0 net/sctp/bind_addr.c:390 sctp_get_port_local+0x21eb/0x2440 net/sctp/socket.c:8452 sctp_get_port net/sctp/socket.c:8523 [inline] sctp_listen_start net/sctp/socket.c:8567 [inline] sctp_inet_listen+0x710/0xfd0 net/sctp/socket.c:8636 __sys_listen_socket net/socket.c:1912 [inline] __sys_listen net/socket.c:1927 [inline] __do_sys_listen net/socket.c:1932 [inline] __se_sys_listen net/socket.c:1930 [inline] __x64_sys_listen+0x343/0x4c0 net/socket.c:1930 x64_sys_call+0x271d/0x3e20 arch/x86/include/generated/asm/syscalls_64.h:51 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xd9/0x210 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Local variable addr.i.i created at: sctp_get_port net/sctp/socket.c:8515 [inline] sctp_listen_start net/sctp/socket.c:8567 [inline] sctp_inet_listen+0x650/0xfd0 net/sctp/socket.c:8636 __sys_listen_socket net/socket.c:1912 [inline] __sys_listen net/socket.c:1927 [inline] __do_sys_listen net/socket.c:1932 [inline] __se_sys_listen net/socket.c:1930 [inline] __x64_sys_listen+0x343/0x4c0 net/socket.c:1930 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+e69f06a0f30116c68056@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68adc0a2.050a0220.37038e.00c4.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Acked-by: Xin Long Link: https://patch.msgid.link/20250826141314.1802610-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sctp/ipv6.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 3336dcfb4515..568ff8797c39 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -547,7 +547,9 @@ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; + addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = sk->sk_v6_rcv_saddr; + addr->v6.sin6_scope_id = 0; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ -- cgit v1.2.3 From 9b8c88f875c04d4cb9111bd5dd9291c7e9691bf5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 13:44:35 +0000 Subject: l2tp: do not use sock_hold() in pppol2tp_session_get_sock() pppol2tp_session_get_sock() is using RCU, it must be ready for sk_refcnt being zero. Commit ee40fb2e1eb5 ("l2tp: protect sock pointer of struct pppol2tp_session with RCU") was correct because it had a call_rcu(..., pppol2tp_put_sk) which was later removed in blamed commit. pppol2tp_recv() can use pppol2tp_session_get_sock() as well. Fixes: c5cbaef992d6 ("l2tp: refactor ppp socket/session relationship") Signed-off-by: Eric Dumazet Cc: James Chapman Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20250826134435.1683435-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/l2tp/l2tp_ppp.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index fc5c2fd8f34c..5e12e7ce17d8 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -129,22 +129,12 @@ static const struct ppp_channel_ops pppol2tp_chan_ops = { static const struct proto_ops pppol2tp_ops; -/* Retrieves the pppol2tp socket associated to a session. - * A reference is held on the returned socket, so this function must be paired - * with sock_put(). 
- */ +/* Retrieves the pppol2tp socket associated to a session. */ static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk; - - rcu_read_lock(); - sk = rcu_dereference(ps->sk); - if (sk) - sock_hold(sk); - rcu_read_unlock(); - return sk; + return rcu_dereference(ps->sk); } /* Helpers to obtain tunnel/session contexts from sockets. @@ -206,14 +196,13 @@ end: static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { - struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk = NULL; + struct sock *sk; /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ rcu_read_lock(); - sk = rcu_dereference(ps->sk); + sk = pppol2tp_session_get_sock(session); if (!sk) goto no_sock; @@ -510,13 +499,14 @@ static void pppol2tp_show(struct seq_file *m, void *arg) struct l2tp_session *session = arg; struct sock *sk; + rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); - sock_put(sk); } + rcu_read_unlock(); } static void pppol2tp_session_init(struct l2tp_session *session) @@ -1530,6 +1520,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) port = ntohs(inet->inet_sport); } + rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { state = sk->sk_state; @@ -1565,8 +1556,8 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); - sock_put(sk); } + rcu_read_unlock(); } static int pppol2tp_seq_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 1cc8a5b534e5f9b5e129e54ee2e63c9f5da4f39a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 17:21:49 +0000 Subject: net: rose: fix a typo in rose_clear_routes() syzbot crashed in rose_clear_routes(), after a recent patch typo. 
KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 0 UID: 0 PID: 10591 Comm: syz.3.1856 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:rose_clear_routes net/rose/rose_route.c:565 [inline] RIP: 0010:rose_rt_ioctl+0x162/0x1250 net/rose/rose_route.c:760 rose_ioctl+0x3ce/0x8b0 net/rose/af_rose.c:1381 sock_do_ioctl+0xd9/0x300 net/socket.c:1238 sock_ioctl+0x576/0x790 net/socket.c:1359 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:598 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: da9c9c877597 ("net: rose: include node references in rose_neigh refcount") Reported-by: syzbot+2eb8d1719f7cfcfa6840@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68af3e29.a70a0220.3cafd4.002e.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250827172149.5359-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/rose/rose_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 1adee1fbc2ed..a1e9b05ef6f5 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -562,7 +562,7 @@ static int rose_clear_routes(void) rose_node = rose_node->next; if (!t->loopback) { - for (i = 0; i < rose_node->count; i++) + for (i = 0; i < t->count; i++) rose_neigh_put(t->neighbour[i]); rose_remove_node(t); } -- cgit v1.2.3 From 2ddaa562b465921a5d1da3fc939993b92b953e20 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 25 Aug 2025 15:56:06 -0700 Subject: fbnic: Fixup rtnl_lock and devl_lock handling related to mailbox code The exception handling path for the __fbnic_pm_resume function had a bug in that it was taking the devlink lock and then exiting to exception handling instead of waiting until after it released the lock to do so. In order to handle that I am swapping the placement of the unlock and the exception handling jump to label so that we don't trigger a deadlock by holding the lock longer than we need to. In addition this change applies the same ordering to the rtnl_lock/unlock calls in the same function as it should make the code easier to follow if it adheres to a consistent pattern. Fixes: 82534f446daa ("eth: fbnic: Add devlink dev flash support") Signed-off-by: Alexander Duyck Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/175616256667.1963577.5543500806256052549.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index b70e4cadb37b..a7784deea88f 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -443,11 +443,10 @@ static int __fbnic_pm_resume(struct device *dev) /* Re-enable mailbox */ err = fbnic_fw_request_mbx(fbd); + devl_unlock(priv_to_devlink(fbd)); if (err) goto err_free_irqs; - devl_unlock(priv_to_devlink(fbd)); - /* Only send log history if log buffer is empty to prevent duplicate * log entries. 
*/ @@ -464,20 +463,20 @@ static int __fbnic_pm_resume(struct device *dev) rtnl_lock(); - if (netif_running(netdev)) { + if (netif_running(netdev)) err = __fbnic_open(fbn); - if (err) - goto err_free_mbx; - } rtnl_unlock(); + if (err) + goto err_free_mbx; return 0; err_free_mbx: fbnic_fw_log_disable(fbd); - rtnl_unlock(); + devl_lock(priv_to_devlink(fbd)); fbnic_fw_free_mbx(fbd); + devl_unlock(priv_to_devlink(fbd)); err_free_irqs: fbnic_free_irqs(fbd); err_invalidate_uc_addr: -- cgit v1.2.3 From 6ede14a2c6365e7e5d855643c7c8390b5268c467 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 25 Aug 2025 15:56:13 -0700 Subject: fbnic: Move phylink resume out of service_task and into open/close The fbnic driver was presenting with the following locking assert coming out of a PM resume: [ 42.208116][ T164] RTNL: assertion failed at drivers/net/phy/phylink.c (2611) [ 42.208492][ T164] WARNING: CPU: 1 PID: 164 at drivers/net/phy/phylink.c:2611 phylink_resume+0x190/0x1e0 [ 42.208872][ T164] Modules linked in: [ 42.209140][ T164] CPU: 1 UID: 0 PID: 164 Comm: bash Not tainted 6.17.0-rc2-virtme #134 PREEMPT(full) [ 42.209496][ T164] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-5.fc42 04/01/2014 [ 42.209861][ T164] RIP: 0010:phylink_resume+0x190/0x1e0 [ 42.210057][ T164] Code: 83 e5 01 0f 85 b0 fe ff ff c6 05 1c cd 3e 02 01 90 ba 33 0a 00 00 48 c7 c6 20 3a 1d a5 48 c7 c7 e0 3e 1d a5 e8 21 b8 90 fe 90 <0f> 0b 90 90 e9 86 fe ff ff e8 42 ea 1f ff e9 e2 fe ff ff 48 89 ef [ 42.210708][ T164] RSP: 0018:ffffc90000affbd8 EFLAGS: 00010296 [ 42.210983][ T164] RAX: 0000000000000000 RBX: ffff8880078d8400 RCX: 0000000000000000 [ 42.211235][ T164] RDX: 0000000000000000 RSI: 1ffffffff4f10938 RDI: 0000000000000001 [ 42.211466][ T164] RBP: 0000000000000000 R08: ffffffffa2ae79ea R09: fffffbfff4b3eb84 [ 42.211707][ T164] R10: 0000000000000003 R11: 0000000000000000 R12: ffff888007ad8000 [ 42.211997][ T164] R13: 0000000000000002 R14: ffff888006a18800 R15: ffffffffa34c59e0 [ 42.212234][ T164] FS: 00007f0dc8e39740(0000) GS:ffff88808f51f000(0000) knlGS:0000000000000000 [ 42.212505][ T164] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 42.212704][ T164] CR2: 00007f0dc8e9fe10 CR3: 000000000b56d003 CR4: 0000000000772ef0 [ 42.213227][ T164] PKRU: 55555554 [ 42.213366][ T164] Call Trace: [ 42.213483][ T164] [ 42.213565][ T164] __fbnic_pm_attach.isra.0+0x8e/0xa0 [ 42.213725][ T164] pci_reset_function+0x116/0x1d0 [ 42.213895][ T164] reset_store+0xa0/0x100 [ 42.214025][ T164] ? pci_dev_reset_attr_is_visible+0x50/0x50 [ 42.214221][ T164] ? sysfs_file_kobj+0xc1/0x1e0 [ 42.214374][ T164] ? sysfs_kf_write+0x65/0x160 [ 42.214526][ T164] kernfs_fop_write_iter+0x2f8/0x4c0 [ 42.214677][ T164] ? kernfs_vma_page_mkwrite+0x1f0/0x1f0 [ 42.214836][ T164] new_sync_write+0x308/0x6f0 [ 42.214987][ T164] ? __lock_acquire+0x34c/0x740 [ 42.215135][ T164] ? new_sync_read+0x6f0/0x6f0 [ 42.215288][ T164] ? lock_acquire.part.0+0xbc/0x260 [ 42.215440][ T164] ? ksys_write+0xff/0x200 [ 42.215590][ T164] ? perf_trace_sched_switch+0x6d0/0x6d0 [ 42.215742][ T164] vfs_write+0x65e/0xbb0 [ 42.215876][ T164] ksys_write+0xff/0x200 [ 42.215994][ T164] ? __ia32_sys_read+0xc0/0xc0 [ 42.216141][ T164] ? do_user_addr_fault+0x269/0x9f0 [ 42.216292][ T164] ? 
rcu_is_watching+0x15/0xd0 [ 42.216442][ T164] do_syscall_64+0xbb/0x360 [ 42.216591][ T164] entry_SYSCALL_64_after_hwframe+0x4b/0x53 [ 42.216784][ T164] RIP: 0033:0x7f0dc8ea9986 A bit of digging showed that we were invoking the phylink_resume as a part of the fbnic_up path when we were enabling the service task while not holding the RTNL lock. We should be enabling this sooner as a part of the ndo_open path and then just letting the service task come online later. This will help to enforce the correct locking and brings the phylink interface online at the same time as the network interface, instead of at a later time. I tested this on QEMU to verify this was working by putting the system to sleep using "echo mem > /sys/power/state" to put the system to sleep in the guest and then using the command "system_wakeup" in the QEMU monitor. Fixes: 69684376eed5 ("eth: fbnic: Add link detection") Signed-off-by: Alexander Duyck Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/175616257316.1963577.12238158800417771119.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 4 ++++ drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index e67e99487a27..40581550da1a 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -52,6 +52,8 @@ int __fbnic_open(struct fbnic_net *fbn) fbnic_bmc_rpc_init(fbd); fbnic_rss_reinit(fbd, fbn); + phylink_resume(fbn->phylink); + return 0; time_stop: fbnic_time_stop(fbn); @@ -84,6 +86,8 @@ static int fbnic_stop(struct net_device *netdev) { struct fbnic_net *fbn = netdev_priv(netdev); + phylink_suspend(fbn->phylink, fbnic_bmc_present(fbn->fbd)); + fbnic_down(fbn); fbnic_pcs_free_irq(fbn->fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index a7784deea88f..28e23e3ffca8 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -118,14 +118,12 @@ static void fbnic_service_task_start(struct fbnic_net *fbn) struct fbnic_dev *fbd = fbn->fbd; schedule_delayed_work(&fbd->service_task, HZ); - phylink_resume(fbn->phylink); } static void fbnic_service_task_stop(struct fbnic_net *fbn) { struct fbnic_dev *fbd = fbn->fbd; - phylink_suspend(fbn->phylink, fbnic_bmc_present(fbd)); cancel_delayed_work(&fbd->service_task); } -- cgit v1.2.3 From dac978e51cce0c1f00a14c4a82f81d387f79b2d4 Mon Sep 17 00:00:00 2001 From: Neil Mandir Date: Tue, 26 Aug 2025 10:30:22 -0400 Subject: net: macb: Disable clocks once When the driver is removed the clocks are disabled twice: once in macb_remove and a second time by runtime pm. Disable wakeup in remove so all the clocks are disabled and skip the second call to macb_clks_disable. Always suspend the device as we always set it active in probe. 
Fixes: d54f89af6cc4 ("net: macb: Add pm runtime support") Signed-off-by: Neil Mandir Co-developed-by: Sean Anderson Signed-off-by: Sean Anderson Link: https://patch.msgid.link/20250826143022.935521-1-sean.anderson@linux.dev Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cadence/macb_main.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 106885451147..16d28a8b3b56 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5404,14 +5404,11 @@ static void macb_remove(struct platform_device *pdev) mdiobus_unregister(bp->mii_bus); mdiobus_free(bp->mii_bus); + device_set_wakeup_enable(&bp->pdev->dev, 0); cancel_work_sync(&bp->hresp_err_bh_work); pm_runtime_disable(&pdev->dev); pm_runtime_dont_use_autosuspend(&pdev->dev); - if (!pm_runtime_suspended(&pdev->dev)) { - macb_clks_disable(bp->pclk, bp->hclk, bp->tx_clk, - bp->rx_clk, bp->tsu_clk); - pm_runtime_set_suspended(&pdev->dev); - } + pm_runtime_set_suspended(&pdev->dev); phylink_destroy(bp->phylink); free_netdev(dev); } -- cgit v1.2.3 From 5189446ba995556eaa3755a6e875bc06675b88bd Mon Sep 17 00:00:00 2001 From: Oscar Maes Date: Wed, 27 Aug 2025 08:23:21 +0200 Subject: net: ipv4: fix regression in local-broadcast routes Commit 9e30ecf23b1b ("net: ipv4: fix incorrect MTU in broadcast routes") introduced a regression where local-broadcast packets would have their gateway set in __mkroute_output, which was caused by fi = NULL being removed. Fix this by resetting the fib_info for local-broadcast packets. This preserves the intended changes for directed-broadcast packets. Cc: stable@vger.kernel.org Fixes: 9e30ecf23b1b ("net: ipv4: fix incorrect MTU in broadcast routes") Reported-by: Brett A C Sheffield Closes: https://lore.kernel.org/regressions/20250822165231.4353-4-bacs@librecast.net Signed-off-by: Oscar Maes Reviewed-by: David Ahern Link: https://patch.msgid.link/20250827062322.4807-1-oscmaes92@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f639a2ae881a..baa43e5966b1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2575,12 +2575,16 @@ static struct rtable *__mkroute_output(const struct fib_result *res, !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); - if (ipv4_is_lbcast(fl4->daddr)) + if (ipv4_is_lbcast(fl4->daddr)) { type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl4->daddr)) + + /* reset fi to prevent gateway resolution */ + fi = NULL; + } else if (ipv4_is_multicast(fl4->daddr)) { type = RTN_MULTICAST; - else if (ipv4_is_zeronet(fl4->daddr)) + } else if (ipv4_is_zeronet(fl4->daddr)) { return ERR_PTR(-EINVAL); + } if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; -- cgit v1.2.3
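To close, the effect of the routing fix above can be summarised as: for a local-broadcast destination the matched fib_info must not be carried forward, because it is what later drives gateway resolution for the resulting rtable. A simplified sketch of the classification in __mkroute_output after the fix (error handling trimmed, mirroring the hunk above):

/* Local broadcast (255.255.255.255) must always be sent on-link, so the
 * fib_info is reset to prevent a gateway from being attached to the route.
 * Directed broadcasts keep their fib_info, preserving the earlier MTU fix.
 */
if (ipv4_is_lbcast(fl4->daddr)) {
	type = RTN_BROADCAST;
	fi = NULL;			/* no gateway resolution */
} else if (ipv4_is_multicast(fl4->daddr)) {
	type = RTN_MULTICAST;
} else if (ipv4_is_zeronet(fl4->daddr)) {
	return ERR_PTR(-EINVAL);
}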