NVMe Doorbell 寄存器：数据请求时的 doorbell 处理

3.NVMe寄存器配置
3.1 寄存器定义
NVMe寄存器主要分为两部分，一部分定义了Controller整体属性，一部分用来存放每组队列的头尾DB寄存器。

CAP——控制器能力，定义了内存页大小的最大最小值、支持的I/O指令集、DB寄存器步长、等待时间界限、仲裁机制、队列是否物理上连续、队列大小；
VS——版本号，定义了控制器实现NVMe协议的版本号；
INTMS——中断掩码设置（Interrupt Mask Set），每个bit对应一个中断向量，写1屏蔽对应中断；使用MSI-X中断时，此寄存器无效；
INTMC——中断掩码清除（Interrupt Mask Clear），每个bit对应一个中断向量，写1取消屏蔽对应中断；使用MSI-X中断时，此寄存器无效；
CC——控制器配置，定义了I/O SQ和CQ队列元素大小、关机状态提醒、仲裁机制、内存页大小、支持的I/O指令集、使能；
CSTS——控制器状态，包括关机状态、控制器致命错误、就绪状态；
AQA——Admin 队列属性，包括SQ大小和CQ大小；
ASQ——Admin SQ基地址；
ACQ——Admin CQ基地址；
1000h之后的寄存器定义了队列的头、尾DB寄存器。
3.2寄存器理解
CAP寄存器标识的是Controller具有多少能力，而CC寄存器则是指当前Controller选择了哪些能力，可以理解为CC是CAP的一个子集；如果重启（reset）的话，可以更换CC配置；
CC.EN置一，表示Controller已经可以开始处理NVM命令，从1到0表示Controller重启；
CC.EN与CSTS.RDY关系密切，CSTS.RDY总是在CC.EN之后由Controller改变，其他不符合执行顺序的操作都将产生未定义的行为；
Admin队列由host直接创建，AQA、ASQ、ACQ三个寄存器标识了Admin队列，而其他I/O队列则由Admin命令创建（eg，创建I/O CQ命令）；
Admin队列的头、尾DB寄存器标识为0，其他I/O队列标识由host按照一定规则分配；只有16bit的有效位，是因为队列深度最大64K。
实际的物理设备CAP.DSTRD值为0，dev->db_stride为1，之后分析中默认db_stride为1

原文链接：https://blog.csdn.net/qq_39021670/article/details/114896973

由dev->dbs的使用方式可知，每一对DB寄存器中，前4个字节为SQ Tail DB，后4个字节为CQ Head DB。

/*
 * Ring the submission-queue tail doorbell when the caller requests it
 * (@write_sq), or when one more submission would bring the tail back to the
 * value last written to the doorbell.
 *
 * The MMIO write goes to nvmeq->q_db (the SQ tail doorbell) and is skipped
 * when nvme_dbbuf_update_and_check_event() returns false. last_sq_tail is
 * updated whenever this function does not return early.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
        u16 tail = nvmeq->sq_tail;

        if (!write_sq) {
                /* Tail position after one more command, wrapped at q_depth. */
                u16 upcoming = tail + 1;

                if (upcoming == nvmeq->q_depth)
                        upcoming = 0;

                /* The next command would not wrap onto the last rung tail: defer. */
                if (upcoming != nvmeq->last_sq_tail)
                        return;
        }

        if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
                                              nvmeq->dbbuf_sq_ei))
                writel(tail, nvmeq->q_db);

        nvmeq->last_sq_tail = tail;
}

/*
 * Publish the current completion-queue head to the device.
 *
 * The CQ head doorbell is addressed as q_db + db_stride, i.e. the slot
 * following the SQ tail doorbell at q_db. The MMIO write is skipped when
 * nvme_dbbuf_update_and_check_event() returns false.
 */
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
        u16 cq_head = nvmeq->cq_head;

        if (!nvme_dbbuf_update_and_check_event(cq_head, nvmeq->dbbuf_cq_db,
                                               nvmeq->dbbuf_cq_ei))
                return;

        writel(cq_head, nvmeq->q_db + nvmeq->dev->db_stride);
}

/*
 * Interrupt handler for a single NVMe queue; @data is the struct nvme_queue
 * the interrupt was registered for.
 *
 * Returns IRQ_HANDLED when cq_head differs from the value recorded at the end
 * of the previous run, or when this run consumed at least one completion
 * entry; returns IRQ_NONE otherwise.
 */
static irqreturn_t nvme_irq(int irq, void *data)
{
        struct nvme_queue *nvmeq = data;
        irqreturn_t ret = IRQ_NONE;
        u16 start, end;

        /*
         * The rmb/wmb pair ensures we see all updates from a previous run of
         * the irq handler, even if that was on another CPU.
         */
        rmb();
        if (nvmeq->cq_head != nvmeq->last_cq_head)
                ret = IRQ_HANDLED;
        /*
         * Scan the CQ for pending entries, advancing cq_head past them.
         * start/end receive the head before and after the scan; a tag of -1
         * matches every entry.
         */
        nvme_process_cq(nvmeq, &start, &end, -1);
        nvmeq->last_cq_head = nvmeq->cq_head;
        wmb();

        if (start != end) {
                /* Complete the entries in [start, end) one by one. */
                nvme_complete_cqes(nvmeq, start, end);
                return IRQ_HANDLED;
        }

        return ret;
}

依次取出 SSD 已经返回的完成项（CQE），然后将 CQ 的 head 写入 Doorbell 寄存器

/*
 * Consume every pending completion entry on @nvmeq.
 *
 * @start: set to cq_head before the scan
 * @end:   set to cq_head after the scan
 * @tag:   command_id to look for; -1U matches every entry
 *
 * Rings the CQ head doorbell if the head moved. Returns the number of
 * consumed entries that matched @tag.
 */
static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
                                  u16 *end, unsigned int tag)
{
        int matches = 0;

        *start = nvmeq->cq_head;
        /* The step clause advances cq_head after each entry is inspected. */
        for (; nvme_cqe_pending(nvmeq); nvme_update_cq_head(nvmeq)) {
                if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
                        matches++;
        }
        *end = nvmeq->cq_head;

        /* Only tell the device about the new head if something was consumed. */
        if (*start != *end)
                nvme_ring_cq_doorbell(nvmeq);

        return matches;
}

/*
 * Publish the current completion-queue head to the device.
 *
 * The CQ head doorbell is addressed as q_db + db_stride, i.e. the slot
 * following the SQ tail doorbell at q_db. The MMIO write is skipped when
 * nvme_dbbuf_update_and_check_event() returns false.
 */
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
        u16 cq_head = nvmeq->cq_head;

        if (!nvme_dbbuf_update_and_check_event(cq_head, nvmeq->dbbuf_cq_db,
                                               nvmeq->dbbuf_cq_ei))
                return;

        writel(cq_head, nvmeq->q_db + nvmeq->dev->db_stride);
}

依次处理 CQ 中的完成项，并将结果返回给 block 层

/*
 * Record the completion outcome on @req and hand it back to the block layer.
 *
 * @status: little-endian status word from the completion entry; it is
 *          converted to CPU order and shifted right by one bit before being
 *          stored in the nvme_request
 * @result: command result, stored as-is
 */
static inline void nvme_end_request(struct request *req, __le16 status,
                union nvme_result result)
{
        struct nvme_request *nreq = nvme_req(req);

        nreq->result = result;
        nreq->status = le16_to_cpu(status) >> 1;

        /* inject error when permitted by fault injection framework */
        nvme_should_fail(req);

        /* Return the request to the block layer. */
        blk_mq_complete_request(req);
}

/*
 * Handle the completion entry at index @idx of @nvmeq's completion queue:
 * async-event completions on queue 0 are forwarded to
 * nvme_complete_async_event(); any other entry is mapped back to its
 * struct request and finished through nvme_end_request().
 */
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
        volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
        struct request *req;

        /*
         * AEN requests are special as they don't time out and can
         * survive any kind of queue freeze and often don't respond to
         * aborts.  We don't even bother to allocate a struct request
         * for them but rather special case them here.
         */
        if (unlikely(nvmeq->qid == 0 &&
                        cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
                nvme_complete_async_event(&nvmeq->dev->ctrl,
                                cqe->status, &cqe->result);
                return;
        }
        /* Use command_id as the tag to look up the originating request. */
        req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
        if (unlikely(!req)) {
                /* No request owns this tag: warn and drop the entry. */
                dev_warn(nvmeq->dev->ctrl.device,
                        "invalid id %d completed on queue %d\n",
                        cqe->command_id, le16_to_cpu(cqe->sq_id));
                return;
        }

        trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
        nvme_end_request(req, cqe->status, cqe->result);
}


/*
 * Handle every completion entry in the half-open ring range [@start, @end),
 * wrapping the index back to 0 when it reaches q_depth.
 */
static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
        u16 idx;

        for (idx = start; idx != end; ) {
                nvme_handle_cqe(nvmeq, idx);

                idx++;
                if (idx == nvmeq->q_depth)
                        idx = 0;
        }
}

/*
 * blk-mq callback table for the admin queue. It submits through
 * nvme_queue_rq and sets no commit_rqs, map_queues or poll hooks.
 */
static const struct blk_mq_ops nvme_mq_admin_ops = {
        /* setup */
        .init_hctx = nvme_admin_init_hctx,
        .init_request = nvme_init_request,
        /* submission / completion */
        .queue_rq = nvme_queue_rq,
        .complete = nvme_pci_complete_rq,
        /* error handling */
        .timeout = nvme_timeout,
};

/*
 * blk-mq callback table for I/O queues. Submission goes through
 * nvme_queue_rq, with nvme_commit_rqs installed as the commit_rqs hook.
 */
static const struct blk_mq_ops nvme_mq_ops = {
        /* setup */
        .init_hctx = nvme_init_hctx,
        .init_request = nvme_init_request,
        .map_queues = nvme_pci_map_queues,
        /* submission / completion */
        .queue_rq = nvme_queue_rq,
        .commit_rqs = nvme_commit_rqs,
        .complete = nvme_pci_complete_rq,
        .poll = nvme_poll,
        /* error handling */
        .timeout = nvme_timeout,
};

admin queue

nvme_queue_rq

io queue

nvme_queue_rq

nvme_commit_rqs

/*
 * Hand @rq straight to the driver's ->queue_rq() hook.
 *
 * @hctx:   hardware context the request is issued on
 * @rq:     request to issue
 * @cookie: receives the request's cookie on BLK_STS_OK and BLK_QC_T_NONE on
 *          a non-resource error; left untouched on a resource status
 * @last:   forwarded to the driver as bd.last
 *
 * Returns the status reported by ->queue_rq().
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq,
                                            blk_qc_t *cookie, bool last)
{
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .last = last,
        };
        blk_qc_t issued_cookie = request_to_qc_t(hctx, rq);
        blk_status_t ret = q->mq_ops->queue_rq(hctx, &bd);

        /* Driver is busy: mark the hctx busy and requeue the request. */
        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                blk_mq_update_dispatch_busy(hctx, true);
                __blk_mq_requeue_request(rq);
                return ret;
        }

        /*
         * Either the request was accepted, or it failed with an error the
         * caller may decide to kill it for. The hctx is not busy either way.
         */
        blk_mq_update_dispatch_busy(hctx, false);
        if (ret == BLK_STS_OK)
                *cookie = issued_cookie;
        else
                *cookie = BLK_QC_T_NONE;

        return ret;
}


 */
/*
 * Send the requests on @list to the driver through ->queue_rq().
 *
 * @q:          request queue whose mq_ops are used
 * @list:       requests to dispatch; whatever could not be dispatched is
 *              spliced onto hctx->dispatch before returning
 * @got_budget: true if the caller already obtained dispatch budget; the
 *              WARN_ON below expects a single-entry list in that case
 *
 * Returns false when @list is empty on entry, when requests are left
 * undispatched, or when the last driver status was a resource status.
 * Otherwise returns true if at least one request was queued or failed.
 */
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                             bool got_budget)
{
        struct blk_mq_hw_ctx *hctx;
        struct request *rq, *nxt;
        bool no_tag = false;
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
        bool no_budget_avail = false;

        if (list_empty(list))
                return false;

        WARN_ON(!list_is_singular(list) && got_budget);

        /*
         * Now process all the entries, sending them to the driver.
         */
        errors = queued = 0;
        do {
                struct blk_mq_queue_data bd;

                rq = list_first_entry(list, struct request, queuelist);

                hctx = rq->mq_hctx;
                /* No budget: release the driver tag and stop dispatching. */
                if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
                        blk_mq_put_driver_tag(rq);
                        no_budget_avail = true;
                        break;
                }

                if (!blk_mq_get_driver_tag(rq)) {
                        /*
                         * The initial allocation attempt failed, so we need to
                         * rerun the hardware queue when a tag is freed. The
                         * waitqueue takes care of that. If the queue is run
                         * before we add this entry back on the dispatch list,
                         * we'll re-run it below.
                         */
                        if (!blk_mq_mark_tag_wait(hctx, rq)) {
                                blk_mq_put_dispatch_budget(hctx);
                                /*
                                 * For non-shared tags, the RESTART check
                                 * will suffice.
                                 */
                                if (hctx->flags & BLK_MQ_F_TAG_SHARED)
                                        no_tag = true;
                                break;
                        }
                }

                list_del_init(&rq->queuelist);

                bd.rq = rq;

                /*
                 * Flag last if we have no more requests, or if we have more
                 * but can't assign a driver tag to it.
                 */
                if (list_empty(list))
                        bd.last = true;
                else {
                        nxt = list_first_entry(list, struct request, queuelist);
                        bd.last = !blk_mq_get_driver_tag(nxt);
                }
                /* Issue the I/O to the driver. */
                ret = q->mq_ops->queue_rq(hctx, &bd);
                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                        blk_mq_handle_dev_resource(rq, list);
                        break;
                }

                /* Any other failure ends the request with an I/O error. */
                if (unlikely(ret != BLK_STS_OK)) {
                        errors++;
                        blk_mq_end_request(rq, BLK_STS_IOERR);
                        continue;
                }

                queued++;
        } while (!list_empty(list));

        hctx->dispatched[queued_to_index(queued)]++;

        /*
         * Any items that need requeuing? Stuff them into hctx->dispatch,
         * that is where we will continue on next queue run.
         */
        if (!list_empty(list)) {
                bool needs_restart;

                /*
                 * If we didn't flush the entire list, we could have told
                 * the driver there was more coming, but that turned out to
                 * be a lie.
                 */
                if (q->mq_ops->commit_rqs)
                        /* Let the driver commit what was queued (nvme_commit_rqs for NVMe I/O queues). */

                        q->mq_ops->commit_rqs(hctx);

                spin_lock(&hctx->lock);
                list_splice_tail_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);

                /*
                 * Order adding requests to hctx->dispatch and checking
                 * SCHED_RESTART flag. The pair of this smp_mb() is the one
                 * in blk_mq_sched_restart(). Avoid restart code path to
                 * miss the new added requests to hctx->dispatch, meantime
                 * SCHED_RESTART is observed here.
                 */
                smp_mb();

                /*
                 * If SCHED_RESTART was set by the caller of this function and
                 * it is no longer set that means that it was cleared by another
                 * thread and hence that a queue rerun is needed.
                 *
                 * If 'no_tag' is set, that means that we failed getting
                 * a driver tag with an I/O scheduler attached. If our dispatch
                 * waitqueue is no longer active, ensure that we run the queue
                 * AFTER adding our entries back to the list.
                 *
                 * If no I/O scheduler has been configured it is possible that
                 * the hardware queue got stopped and restarted before requests
                 * were pushed back onto the dispatch list. Rerun the queue to
                 * avoid starvation. Notes:
                 * - blk_mq_run_hw_queue() checks whether or not a queue has
                 *   been stopped before rerunning a queue.
                 * - Some but not all block drivers stop a queue before
                 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                 *   and dm-rq.
                 *
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
                 * that could otherwise occur if the queue is idle.  We'll do
                 * similar if we couldn't get budget and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
                else if (needs_restart && (ret == BLK_STS_RESOURCE ||
                                           no_budget_avail))
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

                blk_mq_update_dispatch_busy(hctx, true);
                return false;
        } else
                blk_mq_update_dispatch_busy(hctx, false);

        /*
         * If the host/device is unable to accept more work, inform the
         * caller of that.
         */
        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
                return false;

        return (queued + errors) != 0;
}