Perché il mio codice è molto più lento di opencv per un semplice algoritmo StereoBM?

Questo è il mio codice di test per l'implementazione di un semplice algoritmo testBM, senza prefiltraggio. Ma ci vogliono circa 400 ms o anche più quando le dimensioni della finestra sono maggiori mentre lo StereoBM di opencv (CPU non GPU) richiede 20 ms. Ho controllato la fonte di StereoBM ma è difficile per me capirlo. C'è qualcuno che sa perché?Perché il mio codice è molto più lento di opencv per un semplice algoritmo StereoBM?

E di seguito è il mio codice.

void testBM(const Mat &left0, 
      const Mat &right0, 
      Mat &disparity, 
      int SAD, 
      int searchRange) 
{ 
    int cols = left0.cols; 
    int rows = left0.rows; 
    int total = cols*rows; 
    const uchar* data_left = left0.ptr<uchar>(0); 
    const uchar* data_right = right0.ptr<uchar>(0); 
    uchar* data_dm = new uchar[total]; 
    int dbNum = 2 * SAD + 1; 
    int dNum = dbNum * dbNum; 
    //x is col index in the dbNum * dbNum window 
    //y is row index in this window 
    //z is (x + y * cols). 
    //I compute them in advance for avoid computing repeatedly. 
    Point3i *dLocDif = new Point3i[dNum]; 
    for (int i = 0; i < dNum; i++) 
    { 
     dLocDif[i] = Point3i(i%dbNum - SAD, i/dbNum - SAD, 0); 
     dLocDif[i].z = dLocDif[i].x + dLocDif[i].y * cols; 
    } 

    //I compute disparity difference for eache search range to avoid 
    //computing repeatedly. 
    uchar* dif_ = new uchar[total*searchRange]; 
    for (int _search = 0; _search < searchRange; _search++) 
    { 
     int th = _search * total; 
     for (int i = 0; i < total; i++) 
     { 
      int c = i % cols - _search; 
      if (c < 0) continue; 
      dif_[i+th] = (uchar)std::abs(data_left[i] - data_right[i-_search]); 
     } 
    } 
    for (int p = 0; p < total; p++) 
    { 
     int min = 50 * dNum; 
     int dm = -256; 
     int _col = p % cols; 
     int _row = p/cols; 
     int th = 0; 

     //I search for the smallest difference between left and right image 
     // using def_. 
     for (int _search = 0; _search < searchRange; _search++, th += total) 
     { 
      if (_col + _search > cols) break; 
      int temp = 0; 
      for (int i = 0; i < dNum; i++) 
      { 
       int _c = _col + dLocDif[i].x; 
       if (_c >= cols || _c < 0) continue; 
       int _r = _row + dLocDif[i].y; 
       if (_r >= rows || _r < 0) continue; 
       temp += dif_[th + p + dLocDif[i].z]; 
       if (temp > min) 
       { 
        break; 
       } 
      } 
      if (temp < min) 
      { 
       dm = _search; 
       min = temp; 
      } 
     } 
     data_dm[p] = dm; 
    } 
    disparity = Mat(rows, cols, CV_8UC1, data_dm); 
}

Ecco il codice sorgente essenziale di StereoBM in opencv. Sono un po 'confuso dopo l'inizializzazione. Qualcuno potrebbe spiegarci brevemente?

static void 
findStereoCorrespondenceBM(const Mat& left, const Mat& right, 
          Mat& disp, Mat& cost, const CvStereoBMState& state, 
          uchar* buf, int _dy0, int _dy1) 
{ 
    const int ALIGN = 16; 
    int x, y, d; 
    int wsz = state.SADWindowSize, wsz2 = wsz/2; 
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1); 
    int ndisp = state.numberOfDisparities; 
    int mindisp = state.minDisparity; 
    int lofs = MAX(ndisp - 1 + mindisp, 0); 
    int rofs = -MIN(ndisp - 1 + mindisp, 0); 
    int width = left.cols, height = left.rows; 
    int width1 = width - rofs - ndisp + 1; 
    int ftzero = state.preFilterCap; 
    int textureThreshold = state.textureThreshold; 
    int uniquenessRatio = state.uniquenessRatio; 
    short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT); 

    int *sad, *hsad0, *hsad, *hsad_sub, *htext; 
    uchar *cbuf0, *cbuf; 
    const uchar* lptr0 = left.data + lofs; 
    const uchar* rptr0 = right.data + rofs; 
    const uchar *lptr, *lptr_sub, *rptr; 
    short* dptr = (short*)disp.data; 
    int sstep = (int)left.step; 
    int dstep = (int)(disp.step/sizeof(dptr[0])); 
    int cstep = (height+dy0+dy1)*ndisp; 
    int costbuf = 0; 
    int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0; 
    const int TABSZ = 256; 
    uchar tab[TABSZ]; 

    sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN); 
    hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN); 
    htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN); 
    cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN); 

    for(x = 0; x < TABSZ; x++) 
     tab[x] = (uchar)std::abs(x - ftzero); 

    // initialize buffers 
    memset(hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0])); 
    memset(htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0])); 

    for(x = -wsz2-1; x < wsz2; x++) 
    { 
     hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp; 
     lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep; 
     rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-1) - dy0*sstep; 

     for(y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep) 
     { 
      int lval = lptr[0]; 
      for(d = 0; d < ndisp; d++) 
      { 
       int diff = std::abs(lval - rptr[d]); 
       cbuf[d] = (uchar)diff; 
       hsad[d] = (int)(hsad[d] + diff); 
      } 
      htext[y] += tab[lval]; 
     } 
    } 

    // initialize the left and right borders of the disparity map 
    for(y = 0; y < height; y++) 
    { 
     for(x = 0; x < lofs; x++) 
      dptr[y*dstep + x] = FILTERED; 
     for(x = lofs + width1; x < width; x++) 
      dptr[y*dstep + x] = FILTERED; 
    } 
    dptr += lofs; 

    for(x = 0; x < width1; x++, dptr++) 
    { 
     int* costptr = cost.data ? (int*)cost.data + lofs + x : &costbuf; 
     int x0 = x - wsz2 - 1, x1 = x + wsz2; 
     const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; 
     cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp; 
     hsad = hsad0 - dy0*ndisp; 
     lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep; 
     lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep; 
     rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep; 

     for(y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp, 
      hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep) 
     { 
      int lval = lptr[0]; 
      for(d = 0; d < ndisp; d++) 
      { 
       int diff = std::abs(lval - rptr[d]); 
       cbuf[d] = (uchar)diff; 
       hsad[d] = hsad[d] + diff - cbuf_sub[d]; 
      } 
      htext[y] += tab[lval] - tab[lptr_sub[0]]; 
     } 

     // fill borders 
     for(y = dy1; y <= wsz2; y++) 
      htext[height+y] = htext[height+dy1-1]; 
     for(y = -wsz2-1; y < -dy0; y++) 
      htext[y] = htext[-dy0]; 

     // initialize sums 
     for(d = 0; d < ndisp; d++) 
      sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0)); 

     hsad = hsad0 + (1 - dy0)*ndisp; 
     for(y = 1 - dy0; y < wsz2; y++, hsad += ndisp) 
      for(d = 0; d < ndisp; d++) 
       sad[d] = (int)(sad[d] + hsad[d]); 
     int tsum = 0; 
     for(y = -wsz2-1; y < wsz2; y++) 
      tsum += htext[y]; 

     // finally, start the real processing 
     for(y = 0; y < height; y++) 
     { 
      int minsad = INT_MAX, mind = -1; 
      hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp; 
      hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp; 

      for(d = 0; d < ndisp; d++) 
      { 
       int currsad = sad[d] + hsad[d] - hsad_sub[d]; 
       sad[d] = currsad; 
       if(currsad < minsad) 
       { 
        minsad = currsad; 
        mind = d; 
       } 
      } 
      tsum += htext[y + wsz2] - htext[y - wsz2 - 1]; 
      if(tsum < textureThreshold) 
      { 
       dptr[y*dstep] = FILTERED; 
       continue; 
      } 

      if(uniquenessRatio > 0) 
      { 
       int thresh = minsad + (minsad * uniquenessRatio/100); 
       for(d = 0; d < ndisp; d++) 
       { 
        if(sad[d] <= thresh && (d < mind-1 || d > mind+1)) 
         break; 
       } 
       if(d < ndisp) 
       { 
        dptr[y*dstep] = FILTERED; 
        continue; 
       } 
      } 

      { 
      sad[-1] = sad[1]; 
      sad[ndisp] = sad[ndisp-2]; 
      int p = sad[mind+1], n = sad[mind-1]; 
      d = p + n - 2*sad[mind] + std::abs(p - n); 
      dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4); 
      costptr[y*coststep] = sad[mind]; 
      } 
     } 
    } 
}

fonte

2015-04-26 Seven

Quindi vuol utilizzare la maggior parte del tempo nel ciclo 'p' o nella chiamata al Mat (...)? – Surt

nel ciclo 'p'. – Seven

Ho controllato il dispendio di tempo di ogni passaggio. Prima del ciclo 'p', ci vogliono circa 70 ms e il ciclo 'p' usa 320 ms. – Seven

OpenCV esegue molti algoritmi in parallelo; parallel_for/do abstract dei backend TBB, PPL e OpenMP.

L'immagine originale è suddivisa in sottoregioni e findStereoCorrespondenceBM() viene eseguita per ciascuna di queste sottoregioni. Ciò è possibile con l'interfaccia che vediamo dal cv::Mat può essere usato come una vista ad una sotto-immagine senza essere richiesto per copiare i dati reali dei pixel. È possibile verificare ciò osservando i processori in uso (ad es. Con explorer di processo su windows o in alto su unix) durante l'esecuzione del programma.

(originariamente postato da Hauke Heibel come commento)

fonte

2015-10-17 09:25:15

Perché il mio codice è molto più lento di opencv per un semplice algoritmo StereoBM?

risposta

Problemi correlati