Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mandelbrot benchmark's SIMD doesn't give the same output as the scalar version. #26

Open
Panhaolin2001 opened this issue Nov 6, 2024 · 3 comments
Assignees

Comments

@Panhaolin2001
Copy link

#include "VecCore/VecCore"

using namespace vecCore;

template<typename T>
void mandelbrot(T xmin, T xmax, size_t nx,
                T ymin, T ymax, size_t ny,
                size_t max_iter, unsigned char *image)
{
    T dx = (xmax - xmin) / T(nx);
    T dy = (ymax - ymin) / T(ny);

    for (size_t i = 0; i < nx; ++i) {
        for (size_t j = 0; j < ny; ++j) {
            size_t k = 0;
            T x = xmin + T(i) * dx, cr = x, zr = x;
            T y = ymin + T(j) * dy, ci = y, zi = y;

            do {
                x  = zr*zr - zi*zi + cr;
                y  = T(2.0) * zr*zi + ci;
                zr = x;
                zi = y;
            } while (++k < max_iter && (zr*zr + zi*zi < T(4.0)));

            image[ny*i + j] = k;
        }
    }
}

template<typename T>
void mandelbrot_v(Scalar<T> xmin, Scalar<T> xmax, size_t nx,
                  Scalar<T> ymin, Scalar<T> ymax, size_t ny,
                  Scalar<Index<T>> max_iter, unsigned char *image)
{
    T iota;
    for (size_t i = 0; i < VectorSize<T>(); ++i)
        Set<T>(iota, i, i);

    T dx = T(xmax - xmin) / T((Scalar<T>)nx);
    T dy = T(ymax - ymin) / T((Scalar<T>)ny), dyv = iota * dy;

    for (size_t i = 0; i < nx; ++i) {
        for (size_t j = 0; j < ny; j += VectorSize<T>()) {
            Scalar<Index<T>> k(0);
            T x = xmin + T((Scalar<T>)i) * dx,       cr = x, zr = x;
            T y = ymin + T((Scalar<T>)j) * dy + dyv, ci = y, zi = y;

            Index<T> kv(0);
            Mask<T> m(true);

            do {
                x = zr*zr - zi*zi + cr;
                y = T((Scalar<T>)2.0) * zr*zi + ci;
                MaskedAssign<T>(zr, m, x);
                MaskedAssign<T>(zi, m, y);
                MaskedAssign<Index<T>>(kv, m, ++k);
                m = zr*zr + zi*zi < T((Scalar<T>)4.0);
            } while (k < max_iter && !MaskEmpty(m));

            for (size_t k = 0; k < VectorSize<T>(); ++k)
                image[ny*i + j + k] = (unsigned char) Get(kv, k);
        }
    }
}


int main(){
    double xmin = -2.1, xmax = 1.1;
    double ymin = -1.35, ymax = 1.35;

    size_t nx = 1024, ny = 864, max_iter = 500;
    unsigned char *image_scalar = new unsigned char[nx*ny];
    unsigned char *image_simd = new unsigned char[nx*ny];

    mandelbrot_v<backend::SIMDNative::Double_v>(xmin, xmax, nx, ymin, ymax, ny,
                            max_iter, image_simd);
    mandelbrot<double>(xmin, xmax, nx, ymin, ymax, ny,
                            max_iter, image_scalar);
    
    for(int i=0 ; i<nx*ny; ++i){
        assert(image_simd[i] == image_scalar[i]);
    }

    delete[] image_scalar;
    delete[] image_simd;
}
@Panhaolin2001
Copy link
Author

My compiler version is clang 16.0.6, and the compile options are -march=native -O2 -std=c++2a

@amadio
Copy link
Member

amadio commented Nov 19, 2024

What hardware and ISA are you running on? It's expected that the SIMD and non-SIMD codes give outputs with minor differences. I checked on my machine with AVX2 and compared the output images and the differences are very minor. Only the intrinsics version of the benchmark has a bug that needs to be fixed.

@amadio amadio self-assigned this Nov 19, 2024
@amadio
Copy link
Member

amadio commented Nov 19, 2024

Here are attached images for my test:

float scalar:
mandelbrot_float

float std::simd:
mandelbrot_std_simd_native_float

differences highlighted:
diff

With double you'd get less differences. Cheers,

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants