diff --git a/qpms/qpms_c.pyx b/qpms/qpms_c.pyx index de8dab3..be0930a 100644 --- a/qpms/qpms_c.pyx +++ b/qpms/qpms_c.pyx @@ -189,6 +189,15 @@ cdef extern from "translations.h": size_t deststride, size_t srcstride, double kdlj_r, double kdlj_theta, double kdlj_phi, int r_ge_d, int J) nogil + int qpms_cython_trans_calculator_get_AB_arrays_loop(qpms_trans_calculator *c, + int J, int resnd, + int daxis, int saxis, + char *A_data, np.npy_intp *A_shape, np.npy_intp *A_strides, + char *B_data, np.npy_intp *B_shape, np.npy_intp *B_strides, + char *r_data, np.npy_intp *r_shape, np.npy_intp *r_strides, + char *theta_data, np.npy_intp *theta_shape, np.npy_intp *theta_strides, + char *phi_data, np.npy_intp *phi_shape, np.npy_intp *phi_strides, + char *r_ge_d_data, np.npy_intp *phi_shape, np.npy_intp *phi_strides) nogil @@ -629,7 +638,7 @@ cdef class trans_calculator: print(baseshape) print(len(baseshape), resnd,smallaxis, bigaxis) print(r.shape, theta.shape,phi.shape,r_ge_d.shape) - #''' + ''' cdef int longest_axis = 0 # FIxME: the whole thing with longest_axis will fail if none is longer than 1 for i in range(resnd): @@ -641,7 +650,7 @@ cdef class trans_calculator: for i in range(resnd): innerloop_shape[i] = resultshape[i] innerloop_shape[longest_axis] = 1 # longest axis will be iterated in the outer (parallelized) loop. Therefore, longest axis, together with saxis and daxis, will not be iterated in the inner loop - #''' + ''' resultshape[daxis] = self.c[0].nelem resultshape[saxis] = self.c[0].nelem cdef np.ndarray r_c = np.broadcast_to(r,resultshape) @@ -652,11 +661,22 @@ cdef class trans_calculator: cdef np.ndarray b = np.empty(resultshape, dtype=complex) dstride = a.strides[daxis] sstride = a.strides[saxis] - longstride = a.strides[longest_axis] - if innerloop_shape[daxis] != 1: raise - if innerloop_shape[saxis] != 1: raise + #longstride = a.strides[longest_axis] + #if innerloop_shape[daxis] != 1: raise + #if innerloop_shape[saxis] != 1: raise # TODO write this in C (as a function) and parallelize there with nogil: #, parallel(): # FIXME rewrite this part in C + errval = qpms_cython_trans_calculator_get_AB_arrays_loop( + self.c, J, resnd, + daxis, saxis, + a.data, a.shape, a.strides, + b.data, b.shape, b.strides, + r_c.data, r_c.shape, r_c.strides, + theta_c.data, theta_c.shape, theta_c.strides, + phi_c.data, phi_c.shape, phi_c.strides, + r_ge_d_c.data, r_ge_d_c.shape, r_ge_d_c.strides + ) + """ local_indices = calloc(resnd, sizeof(int)) if local_indices == NULL: abort() for longi in range(a.shape[longest_axis]): # outer loop (to be parallelized) @@ -702,5 +722,6 @@ cdef class trans_calculator: ''' free(local_indices) free(innerloop_shape) + """ return a, b # TODO make possible to access the attributes (to show normalization etc) diff --git a/qpms/translations.c b/qpms/translations.c index 485a26d..c62eb11 100644 --- a/qpms/translations.c +++ b/qpms/translations.c @@ -562,9 +562,10 @@ int qpms_trans_calculator_get_AB_arrays_buf(const qpms_trans_calculator *c, costheta,-1,legendre_buf)) abort(); if (qpms_sph_bessel_array(J, 2*c->lMax+2, kdlj.r, bessel_buf)) abort(); size_t desti = 0, srci = 0; - for (int n = 1; n <= c->nelem; ++n) for (int m = -n; m <= n; ++m) { - for (int nu = 1; nu <= c->nelem; ++nu) for (int mu = -nu; mu <= nu; ++mu) { - assert(qpms_trans_calculator_index_mnmunu(c,m,n,mu,nu) == desti*c->nelem + srci); + for (int n = 1; n <= c->lMax; ++n) for (int m = -n; m <= n; ++m) { + for (int nu = 1; nu <= c->lMax; ++nu) for (int mu = -nu; mu <= nu; ++mu) { + size_t assertindex = qpms_trans_calculator_index_mnmunu(c,m,n,mu,nu); + assert(assertindex == desti*c->nelem + srci); *(Adest + deststride * desti + srcstride * srci) = qpms_trans_calculator_get_A_precalcbuf(c,m,n,mu,nu, kdlj,r_ge_d,J,bessel_buf,legendre_buf); @@ -574,6 +575,7 @@ int qpms_trans_calculator_get_AB_arrays_buf(const qpms_trans_calculator *c, ++srci; } ++desti; + srci = 0; } return 0; } @@ -660,4 +662,107 @@ int qpms_trans_calculator_get_AB_arrays_ext(const qpms_trans_calculator *c, return qpms_trans_calculator_get_AB_arrays(c,Adest,Bdest,deststride,srcstride, kdlj, r_ge_d, J); } +#ifdef QPMS_COMPILE_PYTHON_EXTENSIONS +#include +int qpms_cython_trans_calculator_get_AB_arrays_loop( + const qpms_trans_calculator *c, const qpms_bessel_t J, const int resnd, + const int daxis, const int saxis, + char *A_data, const npy_intp *A_shape, const npy_intp *A_strides, + char *B_data, const npy_intp *B_shape, const npy_intp *B_strides, + const char *r_data, const npy_intp *r_shape, const npy_intp *r_strides, + const char *theta_data, const npy_intp *theta_shape, const npy_intp *theta_strides, + const char *phi_data, const npy_intp *phi_shape, const npy_intp *phi_strides, + const char *r_ge_d_data, const npy_intp *r_ge_d_shape, const npy_intp *r_ge_d_strides){ + assert(daxis != saxis); + assert(resnd >= 2); + int longest_axis = 0; + const npy_intp *resultshape = A_shape, *resultstrides = A_strides; + // TODO put some restrict's everywhere? + for (int ax = 0; ax < resnd; ++ax){ + assert(A_shape[ax] == B_shape[ax]); + assert(A_strides[ax] == B_strides[ax]); + if (A_shape[ax] > A_shape[longest_axis]) longest_axis = ax; + } + const npy_intp longlen = resultshape[longest_axis]; + + npy_intp innerloop_shape[resnd]; + for (int ax = 0; ax < resnd; ++ax) { + innerloop_shape[ax] = resultshape[ax]; + } + /* longest axis will be iterated in the outer (parallelized) loop. + * Therefore, longest axis, together with saxis and daxis, + * will not be iterated in the inner loop: + */ + innerloop_shape[longest_axis] = 1; + innerloop_shape[daxis] = 1; + innerloop_shape[saxis] = 1; + + // these are the 'strides' passed to the qpms_trans_calculator_get_AB_arrays_ext + // function, which expects 'const double *' strides, not 'char *' ones. + const npy_intp dstride = resultstrides[daxis] / sizeof(complex double); + const npy_intp sstride = resultstrides[saxis] / sizeof(complex double); + + // TODO here start parallelisation + npy_intp local_indices[resnd]; + memset(local_indices, 0, sizeof(local_indices)); + int errval_local = 0; + + for(npy_intp longi = 0; longi < longlen; ++longi) { + // this might be done also in the inverse order, but this is more + // 'c-contiguous' way of incrementing the indices + int ax = resnd - 1; + while(ax >= 0) { + /* calculate the correct index/pointer for each array used. + * This can be further optimized from O(resnd * total size of + * the result array) to O(total size of the result array), but + * fick that now + */ + const char *r_p = r_data + r_strides[longest_axis] * longi; + const char *theta_p = theta_data + theta_strides[longest_axis] * longi; + const char *phi_p = phi_data + phi_strides[longest_axis] * longi; + const char *r_ge_d_p = r_ge_d_data + r_ge_d_strides[longest_axis] * longi; + char *A_p = A_data + A_strides[longest_axis] * longi; + char *B_p = B_data + B_strides[longest_axis] * longi; + for(int i = 0; i < resnd; ++i) { + // following two lines are probably not needed, as innerloop_shape is there 1 anyway + // so if i == daxis, saxis, or longest_axis, local_indices[i] is zero. + if (i == longest_axis) continue; + if (daxis == i || saxis == i) continue; + r_p += r_strides[i] * local_indices[i]; + theta_p += theta_strides[i] * local_indices[i]; + phi_p += phi_strides[i] * local_indices[i]; + A_p += A_strides[i] * local_indices[i]; + B_p += B_strides[i] * local_indices[i]; + } + + // perform the actual task here + errval_local |= qpms_trans_calculator_get_AB_arrays_ext(c, (complex double *)A_p, + (complex double *)B_p, + dstride, sstride, + // FIXME change all the _ext function types to npy_... so that + // these casts are not needed + *((double *) r_p), *((double *) theta_p), *((double *)phi_p), + (int)(*((npy_bool *) r_ge_d_p)), J); + if (errval_local) abort(); + + // increment the last index 'digit' (ax is now resnd-1; we don't have do-while loop in python) + ++local_indices[ax]; + while(local_indices[ax] == innerloop_shape[ax] && ax >= 0) { + // overflow to the next digit but stop when reached below the last one + local_indices[ax] = 0; + local_indices[--ax]++; + } + if (ax >= 0) // did not overflow, get back to the lowest index + ax = resnd - 1; + } + } + // FIXME when parallelizing + int errval = errval_local; + // TODO Here end parallelisation + return errval; +} + + +#endif // QPMS_COMPILE_PYTHON_EXTENSIONS + diff --git a/qpms/translations.h b/qpms/translations.h index 881bb5e..0cf45a7 100644 --- a/qpms/translations.h +++ b/qpms/translations.h @@ -78,6 +78,22 @@ int qpms_trans_calculator_get_AB_arrays_ext(const qpms_trans_calculator *c, size_t deststride, size_t srcstride, double kdlj_r, double kdlj_theta, double kdlj_phi, int r_ge_d, int J); - + +#ifdef QPMS_COMPILE_PYTHON_EXTENSIONS +#include +#include +int qpms_cython_trans_calculator_get_AB_arrays_loop( + const qpms_trans_calculator *c, qpms_bessel_t J, const int resnd, + int daxis, int saxis, + char *A_data, const npy_intp *A_shape, const npy_intp *A_strides, + char *B_data, const npy_intp *B_shape, const npy_intp *B_strides, + const char *r_data, const npy_intp *r_shape, const npy_intp *r_strides, + const char *theta_data, const npy_intp *theta_shape, const npy_intp *theta_strides, + const char *phi_data, const npy_intp *phi_shape, const npy_intp *phi_strides, + const char *r_ge_d_data, const npy_intp *r_ge_d_shape, const npy_intp *r_ge_d_strides); + + +#endif //QPMS_COMPILE_PYTHON_EXTENSIONS + #endif // QPMS_TRANSLATIONS_H