I'm trying to implement a state-of-the-art motion detection algorithm on an ESP32-CAM.
This board has 512 kB of RAM, and I don't want to use too much CPU.
So I would like a review of what I implemented, to see whether any piece of the code could be optimized.
I tried to comment as much as possible and to link to the existing code that I took and reshaped.
Context:
I am trying to implement Lucas–Kanade optical flow.
My code consists of:

conv: full 1D convolution.

transpose: rescales the input vector to 0..255 and transposes the equivalent array into a buffer.

LK_optical_flow: the main code, which performs a 2D convolution of the input images with Sobel filters and then computes the optical flow magnitude.
Code:
/** Rescale a vector to 0..255 and store it transposed.
 *
 * src is read as a row-major image of h rows by w columns; dst receives the
 * transposed image (w rows by h columns), with the value range [min, max] of
 * src mapped linearly onto [0, 255] (rounded to nearest). A constant input
 * (max == min) produces an all-zero output.
 *
 * Nothing is written when dst is null, w or h is not positive, or src holds
 * fewer than w * h elements.
 *
 * @param[in]  src vector from convolution, unscaled (taken by reference: no copy)
 * @param[out] dst pointer to the image buffer, at least w * h bytes
 * @param[in]  w   width of the source image (number of columns)
 * @param[in]  h   height of the source image (number of rows) */
template<typename T>
void transpose(const std::vector<T> &src, uint8_t *dst, const int w, const int h) {
    if (dst == nullptr || w <= 0 || h <= 0)
        return;
    const std::size_t width = static_cast<std::size_t>(w);
    const std::size_t count = width * static_cast<std::size_t>(h);
    if (src.size() < count)
        return;  // would read past the end of src

    // Single pass for both extremes.
    const auto extremes = std::minmax_element(src.begin(), src.end());
    const float lo = static_cast<float>(*extremes.first);
    const float range = static_cast<float>(*extremes.second) - lo;
    // Hoisted out of the loop; a zero range maps everything to 0 instead of
    // dividing by zero.
    const float scale = (range > 0.0f) ? 255.0f / range : 0.0f;

    // Walk dst linearly (n == i * h + j) so no division/modulo is needed per pixel.
    std::size_t n = 0;
    for (int i = 0; i < w; ++i) {
        for (int j = 0; j < h; ++j, ++n) {
            const float shifted = static_cast<float>(src[width * static_cast<std::size_t>(j) + static_cast<std::size_t>(i)]) - lo;
            // Scale first, then narrow: casting before scaling would wrap the value.
            dst[n] = static_cast<uint8_t>(shifted * scale + 0.5f);
        }
    }
}
/** 1D convolution between a flattened image and a structuring element (strel).
 * Adapted from:
 * https://stackoverflow.com/questions/24518989/how-to-perform-1-dimensional-valid-convolution
 *
 * Returns the centred ("same") part of the full convolution: exactly nf
 * samples, where sample k is full[k + ng / 2]. Samples outside f are treated
 * as zero. Only these nf samples are computed, so no oversized temporary is
 * allocated and nothing has to be erased afterwards.
 *
 * @param f  pointer to the flattened image buffer (nf readable bytes)
 * @param g  structuring element (strel)
 * @param nf number of samples in f
 * @return convolved image as a vector of nf elements (empty when nf <= 0,
 *         all zeros when g is empty) */
template<typename T>
std::vector<T> conv(uint8_t *f, const std::vector<T> &g, const int nf) {
    const int ng = static_cast<int>(g.size());
    if (nf <= 0 || ng == 0)
        return std::vector<T>(nf > 0 ? static_cast<std::size_t>(nf) : 0u, T());

    const int centre = ng / 2;  // offset of the kernel centre in the full convolution
    std::vector<T> out(static_cast<std::size_t>(nf), T());
    for (int k = 0; k < nf; ++k) {
        const int i = k + centre;  // matching index in the full convolution
        // Clamp the summation range so both f[j] and g[i - j] stay in bounds.
        const int jmn = (i >= ng - 1) ? i - (ng - 1) : 0;
        const int jmx = (i < nf - 1) ? i : nf - 1;
        T acc = T();
        for (int j = jmn; j <= jmx; ++j)
            acc += static_cast<T>(f[j]) * g[i - j];
        out[k] = acc;
    }
    return out;
}
/// Optical flow Lucas-Kanade
/** Implement LK optical flow, source from wiki:
 * https://en.wikipedia.org/wiki/Lucas%E2%80%93Kanade_method
 *
 * Side effects: src1 and src2 are filtered IN PLACE (they are reused as the
 * x-gradient and time-gradient buffers to save heap); output is zeroed over
 * w * h bytes. The function returns without doing anything when a pointer is
 * null or w / h is not positive.
 *
 * @param src1   pointer to grayscale image buffer at instant t (w * h bytes, overwritten)
 * @param src2   pointer to grayscale buffer holding the difference image
 *               between t and t+1 (w * h bytes, overwritten)
 * @param output magnitude output image; this function only clears its first
 *               w * h bytes, the rescaled magnitude is written by code not shown here
 * @param w      image width in pixels
 * @param h      image height in pixels */
void LK_optical_flow(uint8_t *src1, uint8_t *src2, uint8_t *output, int w, int h)
{
    if (src1 == nullptr || src2 == nullptr || output == nullptr || w <= 0 || h <= 0)
        return;

    const int count = w * h;
    memset(output, 0, count * sizeof(uint8_t));

    // 1D structuring elements; each 2D filter is applied as two 1D passes.
    const std::vector<int> smoothKernel = {1, 2, 1};   // Sobel smoothing part
    const std::vector<int> derivKernel = {1, 0, -1};   // Sobel derivative part
    const std::vector<int> boxKernel = {1, 1, 1};      // smoothing of the time difference

    // Only fy gets its own storage (too much memory on the heap otherwise):
    // fx and ft reuse the caller's buffers. The vector frees itself on every exit path.
    uint8_t *fx = src1;
    std::vector<uint8_t> fy(src1, src1 + count);
    uint8_t *ft = src2;

    // Separable 2D filter: convolve along rows, transpose, convolve along the
    // former columns, transpose back. The second transpose takes (h, w) because
    // the intermediate image is h wide and w tall. Each intermediate vector is
    // scoped so that only one of them is alive at a time.
    // NOTE(review): the 1D convolution runs over the flattened buffer, so it
    // also mixes the last pixels of a row with the first pixels of the next one.
    // NOTE(review): transpose rescales to unsigned 0..255, so the gradient
    // buffers no longer carry a sign - confirm this is acceptable for the flow.
    const auto filter2d = [w, h, count](uint8_t *img,
                                        const std::vector<int> &rowKernel,
                                        const std::vector<int> &colKernel) {
        {
            const std::vector<int> pass = conv(img, rowKernel, count);
            transpose(pass, img, w, h);
        }
        {
            const std::vector<int> pass = conv(img, colKernel, count);
            transpose(pass, img, h, w);
        }
    };

    filter2d(fx, derivKernel, smoothKernel);         // Sobel Dx
    filter2d(fy.data(), smoothKernel, derivKernel);  // Sobel Dy
    filter2d(ft, boxKernel, boxKernel);              // Dt

    //TODO: Create a function for all below : Mag = opticalflow(fx, fy, ft, window=3)
    const int window = 3;               // half window size -> 7x7 neighbourhood
    const float minDeterminant = 1.0f;  // samples are integers, so a solvable system has det >= 1
    const float maxMagnitude = 1.0e9f;  // keeps the float -> unsigned -> int conversions in range
    std::vector<unsigned> mag(count);

    // Lucas Kanade optical flow algorithm.
    // Rows in the outer loop so that the buffers are read with unit stride.
    for (int y = window; y < h - window; ++y) {
        for (int x = window; x < w - window; ++x) {
            // Accumulate AtA = [[sxx sxy][sxy syy]] and Atb = [sxt syt]
            // over the whole (2 * window + 1)^2 neighbourhood.
            float sxx = 0.0f, syy = 0.0f, sxy = 0.0f;
            float sxt = 0.0f, syt = 0.0f;
            for (int dy = -window; dy <= window; ++dy) {
                const int row = (y + dy) * w + x;
                for (int dx = -window; dx <= window; ++dx) {
                    const int index = row + dx;
                    const float Ix = static_cast<float>(fx[index]);
                    const float Iy = static_cast<float>(fy[index]);
                    const float It = static_cast<float>(ft[index]);
                    sxx += Ix * Ix;
                    syy += Iy * Iy;
                    sxy += Ix * Iy;
                    sxt -= Ix * It;
                    syt -= Iy * It;
                }
            }

            // Singular or ill-conditioned system (flat area, aperture problem):
            // leave the magnitude at 0 instead of dividing by ~0.
            const float det = sxx * syy - sxy * sxy;
            if (det < minDeterminant)
                continue;

            // [Vx Vy] = inv(AtA) . Atb, with inv([[a b][c d]]) = 1/(ad-bc) [[d -b][-c a]]
            const float Vx = (syy * sxt - sxy * syt) / det;
            const float Vy = (sxx * syt - sxy * sxt) / det;
            const float speed = hypotf(Vx, Vy);  // sqrt(Vx^2 + Vy^2)
            mag[x + y * w] = static_cast<unsigned>(speed < maxMagnitude ? speed : maxMagnitude);
        }
    }

    const int max = static_cast<int>(*std::max_element(mag.begin(), mag.end()));
    if (max == 0)
        return;
    ESP_LOGI(TAG, "maxMag = %i \n", max);
    // Compute output, which is Mag rescaled (omitted by the author: nothing interesting here).
}
My next step would be to complete the TODO comments.
Thanks