From 29a521db810507c0a2518c13cbd20da447a6bcc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ne=C4=8Dada?= Date: Mon, 22 Jul 2019 01:08:41 +0300 Subject: [PATCH] ss irrep acked translation matrix attempt Former-commit-id: 2d90bc2a40a7714eb3fa795b9fa9770387373513 --- TODO.md | 2 + qpms/qpms_c.pyx | 11 ++- qpms/qpms_cdefs.pxd | 4 +- qpms/scatsystem.c | 192 +++++++++++++++++++++++++++++++++++++++++++- qpms/scatsystem.h | 21 ++++- 5 files changed, 225 insertions(+), 5 deletions(-) diff --git a/TODO.md b/TODO.md index 445f787..de25fd1 100644 --- a/TODO.md +++ b/TODO.md @@ -26,6 +26,8 @@ TODO list before public release - Remove legacy code. - Prefix all identifiers. Maybe think about a different prefix than qpms? - Consistent indentation and style overall. +- Rewrite the parallelized translation matrix, mode problem matrix generators + in a way that reuses as much code as possible without copypasting Nice but less important features -------------------------------- diff --git a/qpms/qpms_c.pyx b/qpms/qpms_c.pyx index eddf328..dc626f9 100644 --- a/qpms/qpms_c.pyx +++ b/qpms/qpms_c.pyx @@ -1588,7 +1588,16 @@ cdef class ScatteringSystem: cdef np.ndarray[np.complex_t, ndim=2] target = np.empty( (flen,flen),dtype=complex, order='C') cdef cdouble[:,::1] target_view = target - qpms_scatsys_build_translation_matrix_full_e(&target_view[0][0], self.s, k, J) + qpms_scatsys_build_translation_matrix_e_full(&target_view[0][0], self.s, k, J) + return target + + def translation_matrix_packed(self, double k, qpms_iri_t iri, J = QPMS_HANKEL_PLUS): + cdef size_t rlen = self.saecv_sizes[iri] + cdef np.ndarray[np.complex_t, ndim=2] target = np.empty( + (rlen,rlen),dtype=complex, order='C') + cdef cdouble[:,::1] target_view = target + qpms_scatsys_build_translation_matrix_e_irrep_packed(&target_view[0][0], + self.s, iri, k, J) return target def fullvec_psizes(self): diff --git a/qpms/qpms_cdefs.pxd b/qpms/qpms_cdefs.pxd index 1f06284..f9d8914 100644 --- a/qpms/qpms_cdefs.pxd +++ b/qpms/qpms_cdefs.pxd @@ -368,10 +368,12 @@ cdef extern from "scatsystem.h": const qpms_scatsys_t *ss, double k) cdouble *qpms_scatsys_build_translation_matrix_full(cdouble *target, const qpms_scatsys_t *ss, double k) - cdouble *qpms_scatsys_build_translation_matrix_full_e(cdouble *target, + cdouble *qpms_scatsys_build_translation_matrix_e_full(cdouble *target, const qpms_scatsys_t *ss, double k, qpms_bessel_t J) cdouble *qpms_scatsys_build_modeproblem_matrix_irrep_packed(cdouble *target, const qpms_scatsys_t *ss, qpms_iri_t iri, double k) + cdouble *qpms_scatsys_build_translation_matrix_e_irrep_packed(cdouble *target, + const qpms_scatsys_t *ss, qpms_iri_t iri, double k, qpms_bessel_t J) cdouble *qpms_scatsys_build_modeproblem_matrix_irrep_packed_orbitorderR( cdouble *target, const qpms_scatsys_t *ss, qpms_iri_t iri, double k) cdouble *qpms_scatsys_build_modeproblem_matrix_irrep_packed_parallelR( diff --git a/qpms/scatsystem.c b/qpms/scatsystem.c index f79e9e0..1c7158e 100644 --- a/qpms/scatsystem.c +++ b/qpms/scatsystem.c @@ -965,11 +965,11 @@ complex double *qpms_scatsys_build_translation_matrix_full( double k ///< Wave number to use in the translation matrix. ) { - return qpms_scatsys_build_translation_matrix_full_e( + return qpms_scatsys_build_translation_matrix_e_full( target, ss, k, QPMS_HANKEL_PLUS); } -complex double *qpms_scatsys_build_translation_matrix_full_e( +complex double *qpms_scatsys_build_translation_matrix_e_full( /// Target memory with capacity for ss->fecv_size**2 elements. If NULL, new will be allocated. complex double *target, const qpms_scatsys_t *ss, @@ -1008,6 +1008,7 @@ complex double *qpms_scatsys_build_translation_matrix_full_e( } + complex double *qpms_scatsys_build_modeproblem_matrix_full( /// Target memory with capacity for ss->fecv_size**2 elements. If NULL, new will be allocated. complex double *target, @@ -1422,6 +1423,193 @@ static void *qpms_scatsys_build_modeproblem_matrix_irrep_packed_parallelR_thread return NULL; } +// this differs from the ...build_modeproblem_matrix... only by the `J` +// maybe I should use this one there as well to save lines... TODO +struct qpms_scatsys_build_translation_matrix_e_irrep_packed_parallelR_thread_arg{ + const qpms_scatsys_t *ss; + qpms_ss_pi_t *opistartR_ptr; + pthread_mutex_t *opistartR_mutex; + qpms_iri_t iri; + complex double *target_packed; + double k; + qpms_bessel_t J; +}; + +static void *qpms_scatsys_build_translation_matrix_e_irrep_packed_parallelR_thread(void *arg) +{ + const struct qpms_scatsys_build_translation_matrix_e_irrep_packed_parallelR_thread_arg + *a = arg; + const qpms_scatsys_t *ss = a->ss; + const qpms_iri_t iri = a->iri; + const size_t packedlen = ss->saecv_sizes[iri]; + const qpms_bessel_t J = a->J; + + // some of the following workspaces are probably redundant; TODO optimize later. + + // workspace for the uncompressed particle<-particle tranlation matrix block + complex double *Sblock; + QPMS_CRASHING_MALLOC(Sblock, sizeof(complex double)*SQ(ss->max_bspecn)); + + // Workspace for the intermediate particle-orbit matrix result + complex double *tmp; + QPMS_CRASHING_MALLOC(tmp, sizeof(complex double) * SQ(ss->max_bspecn) * ss->sym->order); + + const complex double one = 1, zero = 0; + + while(1) { + // In the beginning, pick a target (row) orbit for this thread + QPMS_ENSURE_SUCCESS(pthread_mutex_lock(a->opistartR_mutex)); + if(*(a->opistartR_ptr) >= ss->p_count) {// Everything is already done, end + QPMS_ENSURE_SUCCESS(pthread_mutex_unlock(a->opistartR_mutex)); + break; + } + const qpms_ss_pi_t opistartR = *(a->opistartR_ptr); + // Now increment it for another thread: + *(a->opistartR_ptr) += ss->orbit_types[ss->p_orbitinfo[ss->p_by_orbit[opistartR]].t].size; + QPMS_ENSURE_SUCCESS(pthread_mutex_unlock(a->opistartR_mutex)); + + // Orbit picked (defined by opistartR), do the work. + const qpms_ss_pi_t orbitstartpiR = ss->p_by_orbit[opistartR]; + const qpms_ss_oti_t otiR = ss->p_orbitinfo[orbitstartpiR].t; + const qpms_ss_osn_t osnR = ss->p_orbitinfo[orbitstartpiR].osn; + const qpms_ss_orbit_type_t *const otR = ss->orbit_types + otiR; + const qpms_ss_orbit_pi_t orbit_p_countR = otR->size; + const size_t orbit_packedsizeR = otR->irbase_sizes[iri]; + + if(orbit_packedsizeR) { // avoid zgemm crash on empty irrep + const size_t particle_fullsizeR = otR->bspecn; // == bspecR->n + const qpms_vswf_set_spec_t *bspecR = ss->tm[ss->p[orbitstartpiR].tmatrix_id]->spec; + // This is the orbit-level matrix projecting the whole orbit onto the irrep. + const complex double *omR = otR->irbases + otR->irbase_offsets[iri]; + // Orbit coeff vector's full size: + const size_t orbit_fullsizeR = otR->size * otR->bspecn; + // This is where the orbit starts in the "packed" vector: + const size_t packed_orbit_offsetR = + ss->saecv_ot_offsets[iri*ss->orbit_type_count + otiR] + + osnR * otR->irbase_sizes[iri]; + for(qpms_ss_orbit_pi_t opiR = 0; opiR < orbit_p_countR; ++opiR) { + qpms_ss_pi_t piR = ss->p_by_orbit[opistartR + opiR]; + assert(opiR == ss->p_orbitinfo[piR].p); + assert(otiR == ss->p_orbitinfo[piR].t); + assert(ss->p_orbitinfo[piR].osn == osnR); + const cart3_t posR = ss->p[piR].pos; + for(qpms_ss_pi_t piC = 0; piC < ss->p_count; ++piC) { //Column loop + const qpms_ss_oti_t otiC = ss->p_orbitinfo[piC].t; + const qpms_ss_orbit_type_t *const otC = ss->orbit_types + otiC; + const qpms_ss_osn_t osnC = ss->p_orbitinfo[piC].osn; + const qpms_ss_orbit_pi_t opiC = ss->p_orbitinfo[piC].p; + // This is where the particle's orbit starts in the "packed" vector: + const size_t packed_orbit_offsetC = + ss->saecv_ot_offsets[iri*ss->orbit_type_count + otiC] + + osnC * otC->irbase_sizes[iri]; + const qpms_vswf_set_spec_t *bspecC = ss->tm[ss->p[piC].tmatrix_id]->spec; + // Orbit coeff vector's full size: + const size_t orbit_fullsizeC = otC->size * otC->bspecn; + const size_t particle_fullsizeC = otC->bspecn; // == bspecC->n + const size_t orbit_packedsizeC = otC->irbase_sizes[iri]; + // This is the orbit-level matrix projecting the whole orbit onto the irrep. + const complex double *omC = otC->irbases + otC->irbase_offsets[iri]; + + if(orbit_packedsizeC) { // avoid zgemm crash on empty irrep + // THIS IS THE MAIN PART DIFFERENT FROM ...modeproblem...() TODO unify + // somehow to save lines + if(piC != piR) { // non-diagonal, calculate S + const cart3_t posC = ss->p[piC].pos; + QPMS_ENSURE_SUCCESS(qpms_trans_calculator_get_trans_array_lc3p(ss->c, + Sblock, // Sblock is S(piR->piC) + bspecR, bspecC->n, bspecC, 1, + a->k, posR, posC, J)); + } else { // diagonal, fill with zeros; TODO does this make sense? + // would unit matrix be better? or unit only for QPMS_BESSEL_REGULAR? + for (size_t row = 0; row < bspecR->n; ++row) + for (size_t col = 0; col < bspecC->n; ++col) + Sblock[row * bspecC->n + col] = 0; //(row == col)? 1 : 0; + } + + // tmp[oiR|piR,piC] = ∑_K M[piR,K] U*[K,piC] + SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasConjTrans, + particle_fullsizeR /*M*/, orbit_packedsizeC /*N*/, particle_fullsizeC /*K*/, + &one /*alpha*/, Sblock/*A*/, particle_fullsizeC/*ldA*/, + omC + opiC*particle_fullsizeC /*B*/, + orbit_fullsizeC/*ldB*/, &zero /*beta*/, + tmp /*C*/, orbit_packedsizeC /*LDC*/); + + // target[oiR|piR,oiC|piC] += U[...] tmp[...] + SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, + orbit_packedsizeR /*M*/, orbit_packedsizeC /*N*/, particle_fullsizeR /*K*/, + &one /*alpha*/, omR + opiR*particle_fullsizeR/*A*/, orbit_fullsizeR/*ldA*/, + tmp /*B*/, orbit_packedsizeC /*ldB*/, &one /*beta*/, + a->target_packed + packedlen*packed_orbit_offsetR + packed_orbit_offsetC /*C*/, + packedlen /*ldC*/); + } + } + } + } + + + + } + free(tmp); + free(Sblock); + return NULL; +} + +// Almost the same as ...build_modeproblem_matrix_...parallelR +// --> TODO write this in a more generic way to save LoC. +complex double *qpms_scatsys_build_translation_matrix_e_irrep_packed( + /// Target memory with capacity for ss->fecv_size**2 elements. If NULL, new will be allocated. + complex double *target_packed, + const qpms_scatsys_t *ss, + qpms_iri_t iri, + double k, ///< Wave number to use in the translation matrix. + qpms_bessel_t J + ) +{ + QPMS_UNTESTED; + const size_t packedlen = ss->saecv_sizes[iri]; + if (!packedlen) // THIS IS A BIT PROBLEMATIC, TODO how to deal with empty irreps? + return target_packed; + if (target_packed == NULL) + target_packed = malloc(SQ(packedlen)*sizeof(complex double)); + if (target_packed == NULL) abort(); + memset(target_packed, 0, SQ(packedlen)*sizeof(complex double)); + + qpms_ss_pi_t opistartR = 0; + pthread_mutex_t opistartR_mutex; + QPMS_ENSURE_SUCCESS(pthread_mutex_init(&opistartR_mutex, NULL)); + const struct qpms_scatsys_build_translation_matrix_e_irrep_packed_parallelR_thread_arg + arg = {ss, &opistartR, &opistartR_mutex, iri, target_packed, k, J}; + + // FIXME THIS IS NOT PORTABLE: + long nthreads; + if (qpms_scatsystem_nthreads_override > 0) { + nthreads = qpms_scatsystem_nthreads_override; + QPMS_DEBUG(QPMS_DBGMSG_THREADS, "Using overriding value of %ld thread(s).", + nthreads); + } else { + nthreads = sysconf(_SC_NPROCESSORS_ONLN); + if (nthreads < 1) { + QPMS_DEBUG(QPMS_DBGMSG_THREADS, "_SC_NPROCESSORS_ONLN returned %ld, using %ld thread(s) instead.", + nthreads, qpms_scatsystem_nthreads_default); + nthreads = qpms_scatsystem_nthreads_default; + } else { + QPMS_DEBUG(QPMS_DBGMSG_THREADS, "_SC_NRPOCESSORS_ONLN returned %ld.", nthreads); + } + } + pthread_t thread_ids[nthreads]; + for(long thi = 0; thi < nthreads; ++thi) + QPMS_ENSURE_SUCCESS(pthread_create(thread_ids + thi, NULL, + qpms_scatsys_build_translation_matrix_e_irrep_packed_parallelR_thread, + (void *) &arg)); + for(long thi = 0; thi < nthreads; ++thi) { + void *retval; + QPMS_ENSURE_SUCCESS(pthread_join(thread_ids[thi], &retval)); + } + + QPMS_ENSURE_SUCCESS(pthread_mutex_destroy(&opistartR_mutex)); + return target_packed; +} + complex double *qpms_scatsys_build_modeproblem_matrix_irrep_packed_parallelR( /// Target memory with capacity for ss->saecv_sizes[iri]**2 elements. If NULL, new will be allocated. complex double *target_packed, diff --git a/qpms/scatsystem.h b/qpms/scatsystem.h index 7f5c5b0..ba8646b 100644 --- a/qpms/scatsystem.h +++ b/qpms/scatsystem.h @@ -228,6 +228,11 @@ complex double *qpms_scatsys_irrep_unpack_vector(complex double *target_full, const complex double *orig_packed, const qpms_scatsys_t *ss, qpms_iri_t iri, bool add); +/// Global translation matrix. +/** + * The diagonal (particle self-) block are filled with zeros. + * This may change in the future. + */ complex double *qpms_scatsys_build_translation_matrix_full( /// Target memory with capacity for ss->fecv_size**2 elements. If NULL, new will be allocated. complex double *target, @@ -238,7 +243,7 @@ complex double *qpms_scatsys_build_translation_matrix_full( /// As qpms_scatsys_build_translation_full() but with choice of Bessel function type. /** Might be useful for evaluation of cross sections and testing. */ -complex double *qpms_scatsys_build_translation_matrix_full_e( +complex double *qpms_scatsys_build_translation_matrix_e_full( /// Target memory with capacity for ss->fecv_size**2 elements. If NULL, new will be allocated. complex double *target, const qpms_scatsys_t *ss, @@ -246,6 +251,20 @@ complex double *qpms_scatsys_build_translation_matrix_full_e( qpms_bessel_t J ); +/// Global translation matrix with selectable Bessel function, projected on an irrep. +/** + * The diagonal (particle self-) blocks are currently filled with zeros. + * This may change in the future. + */ +complex double *qpms_scatsys_build_translation_matrix_e_irrep_packed( + /// Target memory with capacity for ss->fecv_size**2 elements. If NULL, new will be allocated. + complex double *target, + const qpms_scatsys_t *ss, + qpms_iri_t iri, + double k, ///< Wave number to use in the translation matrix. + qpms_bessel_t J + ); + complex double *qpms_scatsys_build_modeproblem_matrix_full( /// Target memory with capacity for ss->fecv_size**2 elements. If NULL, new will be allocated. complex double *target,