Parallel modeproblem matrix fixed?
Former-commit-id: 9ad51b186a68689a754ce986d7f8bf2f97ac258f
This commit is contained in:
parent
8f4a8c7c7b
commit
96c9e95ea0
|
@ -251,15 +251,21 @@ complex double *qpms_scatsysw_build_modeproblem_matrix_full_boosted(
|
||||||
for(qpms_ss_pi_t piC = 0; piC < ss->p_count; ++piC) {
|
for(qpms_ss_pi_t piC = 0; piC < ss->p_count; ++piC) {
|
||||||
const qpms_vswf_set_spec_t *bspecC = ssw->tm[ss->p[piC].tmatrix_id]->spec;
|
const qpms_vswf_set_spec_t *bspecC = ssw->tm[ss->p[piC].tmatrix_id]->spec;
|
||||||
if(piC != piR) { // The diagonal will be dealt with later.
|
if(piC != piR) { // The diagonal will be dealt with later.
|
||||||
uoppid_t pid = uopairid(ss->p_count, piC, piR);
|
|
||||||
const cart3_t posC = ss->p[piC].pos;
|
const cart3_t posC = ss->p[piC].pos;
|
||||||
const sph_t dlj = cart2sph(cart3_substract(posR, posC));
|
#if 0
|
||||||
const size_t ri = b->r_map[pid];
|
QPMS_ENSURE_SUCCESS(qpms_trans_calculator_get_trans_array_lc3p(ss->c,
|
||||||
QPMS_PARANOID_ASSERT(dlj.r == b->r[ri]);
|
tmp, // tmp is S(piR<-piC)
|
||||||
const qpms_l_t pair_lMax = b->lMax_r[ri];
|
bspecR, bspecC->n, bspecC, 1,
|
||||||
const qpms_y_t pair_nelem = qpms_lMax2nelem(pair_lMax);
|
k, posR, posC, QPMS_HANKEL_PLUS));
|
||||||
|
#else
|
||||||
{ // this replaces qpms_trans_calculator_get_trans_array():
|
{ // this replaces qpms_trans_calculator_get_trans_array():
|
||||||
// R is dest, C is src
|
// R is dest, C is src
|
||||||
|
const sph_t dlj = cart2sph(cart3_substract(posR, posC));
|
||||||
|
const uoppid_t pid = uopairid(ss->p_count, piC, piR);
|
||||||
|
const size_t ri = b->r_map[pid];
|
||||||
|
QPMS_PARANOID_ASSERT(dlj.r == b->r[ri]);
|
||||||
|
const qpms_l_t pair_lMax = b->lMax_r[ri];
|
||||||
|
const qpms_y_t pair_nelem = qpms_lMax2nelem(pair_lMax);
|
||||||
QPMS_PARANOID_ASSERT(c->normalisation == bspecC->norm && c->normalisation == bspecR->norm);
|
QPMS_PARANOID_ASSERT(c->normalisation == bspecC->norm && c->normalisation == bspecR->norm);
|
||||||
QPMS_PARANOID_ASSERT(c->lMax >= bspecC->lMax && c->lMax >= bspecR->lMax);
|
QPMS_PARANOID_ASSERT(c->lMax >= bspecC->lMax && c->lMax >= bspecR->lMax);
|
||||||
QPMS_PARANOID_ASSERT(bspecC->lMax_L < 0 && bspecR->lMax_L < 0);
|
QPMS_PARANOID_ASSERT(bspecC->lMax_L < 0 && bspecR->lMax_L < 0);
|
||||||
|
@ -275,6 +281,7 @@ complex double *qpms_scatsysw_build_modeproblem_matrix_full_boosted(
|
||||||
bspecR, bspecC->n, bspecC, 1, A, B, pair_lMax);
|
bspecR, bspecC->n, bspecC, 1, A, B, pair_lMax);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
bspecR->n /*m*/, bspecC->n /*n*/, bspecR->n /*k*/,
|
bspecR->n /*m*/, bspecC->n /*n*/, bspecR->n /*k*/,
|
||||||
&minusone/*alpha*/, tmmR/*a*/, bspecR->n/*lda*/,
|
&minusone/*alpha*/, tmmR/*a*/, bspecR->n/*lda*/,
|
||||||
|
@ -301,15 +308,14 @@ void *qpms_scatsysw_build_modeproblem_matrix_irrep_packed_parallelR_thread_boost
|
||||||
const struct qpms_scatsysw_build_modeproblem_matrix_irrep_packed_parallelR_thread_arg
|
const struct qpms_scatsysw_build_modeproblem_matrix_irrep_packed_parallelR_thread_arg
|
||||||
*a = arg;
|
*a = arg;
|
||||||
const qpms_scatsys_at_omega_t *ssw = a->ssw;
|
const qpms_scatsys_at_omega_t *ssw = a->ssw;
|
||||||
const complex double k = ssw->wavenumber;
|
|
||||||
const qpms_scatsys_t *ss = ssw->ss;
|
|
||||||
const qpms_iri_t iri = a->iri;
|
|
||||||
const size_t packedlen = ss->saecv_sizes[iri];
|
|
||||||
|
|
||||||
QPMS_ASSERT(ssw->translation_cache && ssw->ss->tbooster);
|
QPMS_ASSERT(ssw->translation_cache && ssw->ss->tbooster);
|
||||||
|
const qpms_scatsys_t * const ss = ssw->ss;
|
||||||
|
const qpms_trans_calculator *const c = ss->c;
|
||||||
const booster_t *const b = ss->tbooster;
|
const booster_t *const b = ss->tbooster;
|
||||||
const boosterw_t *const bw = ssw->translation_cache;
|
const boosterw_t *const bw = ssw->translation_cache;
|
||||||
const qpms_trans_calculator *const c = ss->c;
|
const complex double k = ssw->wavenumber;
|
||||||
|
const qpms_iri_t iri = a->iri;
|
||||||
|
const size_t packedlen = ss->saecv_sizes[iri];
|
||||||
|
|
||||||
// some of the following workspaces are probably redundant; TODO optimize later.
|
// some of the following workspaces are probably redundant; TODO optimize later.
|
||||||
|
|
||||||
|
@ -318,15 +324,15 @@ void *qpms_scatsysw_build_modeproblem_matrix_irrep_packed_parallelR_thread_boost
|
||||||
complex double *Sblock, *TSblock;
|
complex double *Sblock, *TSblock;
|
||||||
QPMS_CRASHING_MALLOC(Sblock, sizeof(complex double)*SQ(ss->max_bspecn));
|
QPMS_CRASHING_MALLOC(Sblock, sizeof(complex double)*SQ(ss->max_bspecn));
|
||||||
QPMS_CRASHING_MALLOC(TSblock, sizeof(complex double)*SQ(ss->max_bspecn));
|
QPMS_CRASHING_MALLOC(TSblock, sizeof(complex double)*SQ(ss->max_bspecn));
|
||||||
// Workspaces for the translation operator A and B matrices
|
|
||||||
complex double *A, *B;
|
|
||||||
QPMS_CRASHING_MALLOC(A, SQ(c->nelem) * sizeof(*A));
|
|
||||||
QPMS_CRASHING_MALLOC(B, SQ(c->nelem) * sizeof(*B));
|
|
||||||
double legendre_buf[gsl_sf_legendre_array_n(2*c->lMax + 1)]; //VLA, workspace for legendre arrays
|
|
||||||
|
|
||||||
// Workspace for the intermediate particle-orbit matrix result
|
// Workspace for the intermediate particle-orbit matrix result
|
||||||
complex double *tmp;
|
complex double *tmp;
|
||||||
QPMS_CRASHING_MALLOC(tmp, sizeof(complex double) * SQ(ss->max_bspecn) * ss->sym->order);
|
QPMS_CRASHING_MALLOC(tmp, sizeof(complex double) * SQ(ss->max_bspecn) * ss->sym->order);
|
||||||
|
// Workspace for A, B arrays
|
||||||
|
complex double *A, *B;
|
||||||
|
QPMS_CRASHING_MALLOC(A, SQ(c->nelem) * sizeof(*A));
|
||||||
|
QPMS_CRASHING_MALLOC(B, SQ(c->nelem) * sizeof(*B));
|
||||||
|
double legendre_buf[gsl_sf_legendre_array_n(2*c->lMax + 1)]; //VLA, workspace for legendre arrays
|
||||||
|
|
||||||
const complex double one = 1, zero = 0, minusone = -1;
|
const complex double one = 1, zero = 0, minusone = -1;
|
||||||
|
|
||||||
|
@ -362,99 +368,103 @@ void *qpms_scatsysw_build_modeproblem_matrix_irrep_packed_parallelR_thread_boost
|
||||||
ss->saecv_ot_offsets[iri*ss->orbit_type_count + otiR]
|
ss->saecv_ot_offsets[iri*ss->orbit_type_count + otiR]
|
||||||
+ osnR * otR->irbase_sizes[iri];
|
+ osnR * otR->irbase_sizes[iri];
|
||||||
for(qpms_ss_orbit_pi_t opiR = 0; opiR < orbit_p_countR; ++opiR) {
|
for(qpms_ss_orbit_pi_t opiR = 0; opiR < orbit_p_countR; ++opiR) {
|
||||||
for(qpms_ss_orbit_pi_t opiR = 0; opiR < orbit_p_countR; ++opiR) {
|
qpms_ss_pi_t piR = ss->p_by_orbit[opistartR + opiR];
|
||||||
qpms_ss_pi_t piR = ss->p_by_orbit[opistartR + opiR];
|
assert(opiR == ss->p_orbitinfo[piR].p);
|
||||||
assert(opiR == ss->p_orbitinfo[piR].p);
|
assert(otiR == ss->p_orbitinfo[piR].t);
|
||||||
assert(otiR == ss->p_orbitinfo[piR].t);
|
assert(ss->p_orbitinfo[piR].osn == osnR);
|
||||||
assert(ss->p_orbitinfo[piR].osn == osnR);
|
const cart3_t posR = ss->p[piR].pos;
|
||||||
const cart3_t posR = ss->p[piR].pos;
|
// dest particle T-matrix
|
||||||
// dest particle T-matrix
|
const complex double *tmmR = ssw->tm[ss->p[piR].tmatrix_id]->m;
|
||||||
const complex double *tmmR = ssw->tm[ss->p[piR].tmatrix_id]->m;
|
for(qpms_ss_pi_t piC = 0; piC < ss->p_count; ++piC) { //Column loop
|
||||||
for(qpms_ss_pi_t piC = 0; piC < ss->p_count; ++piC) { //Column loop
|
const qpms_ss_oti_t otiC = ss->p_orbitinfo[piC].t;
|
||||||
const qpms_ss_oti_t otiC = ss->p_orbitinfo[piC].t;
|
const qpms_ss_orbit_type_t *const otC = ss->orbit_types + otiC;
|
||||||
const qpms_ss_orbit_type_t *const otC = ss->orbit_types + otiC;
|
const qpms_ss_osn_t osnC = ss->p_orbitinfo[piC].osn;
|
||||||
const qpms_ss_osn_t osnC = ss->p_orbitinfo[piC].osn;
|
const qpms_ss_orbit_pi_t opiC = ss->p_orbitinfo[piC].p;
|
||||||
const qpms_ss_orbit_pi_t opiC = ss->p_orbitinfo[piC].p;
|
// This is where the particle's orbit starts in the "packed" vector:
|
||||||
// This is where the particle's orbit starts in the "packed" vector:
|
const size_t packed_orbit_offsetC =
|
||||||
const size_t packed_orbit_offsetC =
|
ss->saecv_ot_offsets[iri*ss->orbit_type_count + otiC]
|
||||||
ss->saecv_ot_offsets[iri*ss->orbit_type_count + otiC]
|
+ osnC * otC->irbase_sizes[iri];
|
||||||
+ osnC * otC->irbase_sizes[iri];
|
const qpms_vswf_set_spec_t *bspecC = ssw->tm[ss->p[piC].tmatrix_id]->spec;
|
||||||
const qpms_vswf_set_spec_t *bspecC = ssw->tm[ss->p[piC].tmatrix_id]->spec;
|
// Orbit coeff vector's full size:
|
||||||
// Orbit coeff vector's full size:
|
const size_t orbit_fullsizeC = otC->size * otC->bspecn;
|
||||||
const size_t orbit_fullsizeC = otC->size * otC->bspecn;
|
const size_t particle_fullsizeC = otC->bspecn; // == bspecC->n
|
||||||
const size_t particle_fullsizeC = otC->bspecn; // == bspecC->n
|
const size_t orbit_packedsizeC = otC->irbase_sizes[iri];
|
||||||
const size_t orbit_packedsizeC = otC->irbase_sizes[iri];
|
// This is the orbit-level matrix projecting the whole orbit onto the irrep.
|
||||||
// This is the orbit-level matrix projecting the whole orbit onto the irrep.
|
const complex double *omC = otC->irbases + otC->irbase_offsets[iri];
|
||||||
const complex double *omC = otC->irbases + otC->irbase_offsets[iri];
|
|
||||||
|
|
||||||
if(orbit_packedsizeC) { // avoid zgemm crash on empty irrep
|
if(orbit_packedsizeC) { // avoid zgemm crash on empty irrep
|
||||||
if(piC != piR) { // non-diagonal, calculate TS
|
if(piC != piR) { // non-diagonal, calculate TS
|
||||||
const cart3_t posC = ss->p[piC].pos;
|
const cart3_t posC = ss->p[piC].pos;
|
||||||
#if 0
|
#if 0
|
||||||
QPMS_ENSURE_SUCCESS(qpms_trans_calculator_get_trans_array_lc3p(ss->c,
|
QPMS_ENSURE_SUCCESS(qpms_trans_calculator_get_trans_array_lc3p(ss->c,
|
||||||
Sblock, // Sblock is S(piR->piC)
|
Sblock, // Sblock is S(piR->piC)
|
||||||
bspecR, bspecC->n, bspecC, 1,
|
bspecR, bspecC->n, bspecC, 1,
|
||||||
k, posR, posC, QPMS_HANKEL_PLUS));
|
k, posR, posC, QPMS_HANKEL_PLUS));
|
||||||
#endif
|
#else
|
||||||
{ // this block replaces qpms_trans_calculator_get_trans_array():
|
{ // this block replaces qpms_trans_calculator_get_trans_array():
|
||||||
// R is dest, C is src
|
// R is dest, C is src
|
||||||
const sph_t dlj = cart2sph(cart3_substract(posR, posC));
|
const sph_t dlj = cart2sph(cart3_substract(posR, posC));
|
||||||
const uoppid_t pid = uopairid(ss->p_count, piC, piR);
|
const uoppid_t pid = uopairid(ss->p_count, piC, piR);
|
||||||
const size_t ri = b->r_map[pid];
|
const size_t ri = b->r_map[pid];
|
||||||
QPMS_PARANOID_ASSERT(dlj.r == b->r[ri]);
|
QPMS_PARANOID_ASSERT(dlj.r == b->r[ri]);
|
||||||
const qpms_l_t pair_lMax = b->lMax_r[ri];
|
const qpms_l_t pair_lMax = b->lMax_r[ri];
|
||||||
const qpms_y_t pair_nelem = qpms_lMax2nelem(pair_lMax);
|
const qpms_y_t pair_nelem = qpms_lMax2nelem(pair_lMax);
|
||||||
|
QPMS_PARANOID_ASSERT(c->normalisation == bspecC->norm && c->normalisation == bspecR->norm);
|
||||||
{ // this replaces qpms_trans_calculator_get_AB_arrays() and _buf()
|
QPMS_PARANOID_ASSERT(c->lMax >= bspecC->lMax && c->lMax >= bspecR->lMax);
|
||||||
const double costheta = cos(dlj.theta);
|
QPMS_PARANOID_ASSERT(bspecC->lMax_L < 0 && bspecR->lMax_L < 0);
|
||||||
QPMS_ENSURE_SUCCESS(gsl_sf_legendre_array_e(GSL_SF_LEGENDRE_NONE,
|
{ // this replaces qpms_trans_calculator_get_AB_arrays() and _buf()
|
||||||
2*pair_lMax+1, costheta, -1, legendre_buf));
|
const double costheta = cos(dlj.theta);
|
||||||
const double * const legendres = legendre_buf;
|
QPMS_ENSURE_SUCCESS(gsl_sf_legendre_array_e(GSL_SF_LEGENDRE_NONE,
|
||||||
const complex double * const bessels = bw->bessels + b->bessel_offsets_r[ri];
|
2*pair_lMax+1, costheta, -1, legendre_buf));
|
||||||
qpms_trans_calculator_get_AB_arrays_precalcbuf(c, pair_lMax, A, B,
|
const double * const legendres = legendre_buf;
|
||||||
/*deststride*/ pair_nelem, /*srcstride*/ 1, dlj.phi, bessels, legendres);
|
const complex double * const bessels = bw->bessels + b->bessel_offsets_r[ri];
|
||||||
}
|
qpms_trans_calculator_get_AB_arrays_precalcbuf(c, pair_lMax, A, B,
|
||||||
qpms_trans_array_from_AB(Sblock, // Sblock is S(piR->piC)
|
/*deststride*/ pair_nelem, /*srcstride*/ 1, dlj.phi, bessels, legendres);
|
||||||
bspecR, bspecC->n, bspecC, 1, A, B, pair_lMax);
|
|
||||||
}
|
}
|
||||||
|
qpms_trans_array_from_AB(Sblock, // Sblock is S(piR->piC)
|
||||||
SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
bspecR, bspecC->n, bspecC, 1, A, B, pair_lMax);
|
||||||
bspecR->n /*m*/, bspecC->n /*n*/, bspecR->n /*k*/,
|
|
||||||
&minusone/*alpha*/, tmmR/*a*/, bspecR->n/*lda*/,
|
|
||||||
Sblock/*b*/, bspecC->n/*ldb*/, &zero/*beta*/,
|
|
||||||
TSblock /*c*/, bspecC->n /*ldc*/);
|
|
||||||
} else { // diagonal, fill with diagonal +1
|
|
||||||
for (size_t row = 0; row < bspecR->n; ++row)
|
|
||||||
for (size_t col = 0; col < bspecC->n; ++col)
|
|
||||||
TSblock[row * bspecC->n + col] = (row == col)? +1 : 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// tmp[oiR|piR,piC] = ∑_K M[piR,K] U*[K,piC]
|
#endif
|
||||||
SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasConjTrans,
|
|
||||||
particle_fullsizeR /*M*/, orbit_packedsizeC /*N*/, particle_fullsizeC /*K*/,
|
|
||||||
&one /*alpha*/, TSblock/*A*/, particle_fullsizeC/*ldA*/,
|
|
||||||
omC + opiC*particle_fullsizeC /*B*/,
|
|
||||||
orbit_fullsizeC/*ldB*/, &zero /*beta*/,
|
|
||||||
tmp /*C*/, orbit_packedsizeC /*LDC*/);
|
|
||||||
|
|
||||||
// target[oiR|piR,oiC|piC] += U[...] tmp[...]
|
|
||||||
SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
orbit_packedsizeR /*M*/, orbit_packedsizeC /*N*/, particle_fullsizeR /*K*/,
|
bspecR->n /*m*/, bspecC->n /*n*/, bspecR->n /*k*/,
|
||||||
&one /*alpha*/, omR + opiR*particle_fullsizeR/*A*/, orbit_fullsizeR/*ldA*/,
|
&minusone/*alpha*/, tmmR/*a*/, bspecR->n/*lda*/,
|
||||||
tmp /*B*/, orbit_packedsizeC /*ldB*/, &one /*beta*/,
|
Sblock/*b*/, bspecC->n/*ldb*/, &zero/*beta*/,
|
||||||
a->target_packed + packedlen*packed_orbit_offsetR + packed_orbit_offsetC /*C*/,
|
TSblock /*c*/, bspecC->n /*ldc*/);
|
||||||
packedlen /*ldC*/);
|
} else { // diagonal, fill with diagonal +1
|
||||||
|
for (size_t row = 0; row < bspecR->n; ++row)
|
||||||
|
for (size_t col = 0; col < bspecC->n; ++col)
|
||||||
|
TSblock[row * bspecC->n + col] = (row == col)? +1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// tmp[oiR|piR,piC] = ∑_K M[piR,K] U*[K,piC]
|
||||||
|
SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasConjTrans,
|
||||||
|
particle_fullsizeR /*M*/, orbit_packedsizeC /*N*/, particle_fullsizeC /*K*/,
|
||||||
|
&one /*alpha*/, TSblock/*A*/, particle_fullsizeC/*ldA*/,
|
||||||
|
omC + opiC*particle_fullsizeC /*B*/,
|
||||||
|
orbit_fullsizeC/*ldB*/, &zero /*beta*/,
|
||||||
|
tmp /*C*/, orbit_packedsizeC /*LDC*/);
|
||||||
|
|
||||||
|
// target[oiR|piR,oiC|piC] += U[...] tmp[...]
|
||||||
|
SERIAL_ZGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
|
||||||
|
orbit_packedsizeR /*M*/, orbit_packedsizeC /*N*/, particle_fullsizeR /*K*/,
|
||||||
|
&one /*alpha*/, omR + opiR*particle_fullsizeR/*A*/, orbit_fullsizeR/*ldA*/,
|
||||||
|
tmp /*B*/, orbit_packedsizeC /*ldB*/, &one /*beta*/,
|
||||||
|
a->target_packed + packedlen*packed_orbit_offsetR + packed_orbit_offsetC /*C*/,
|
||||||
|
packedlen /*ldC*/);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free(tmp);
|
|
||||||
free(A);
|
free(A);
|
||||||
free(B);
|
free(B);
|
||||||
|
free(tmp);
|
||||||
free(Sblock);
|
free(Sblock);
|
||||||
free(TSblock);
|
free(TSblock);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1647,9 +1647,6 @@ static void *qpms_scatsys_build_translation_matrix_e_irrep_packed_parallelR_thre
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
free(tmp);
|
free(tmp);
|
||||||
free(Sblock);
|
free(Sblock);
|
||||||
|
@ -1810,7 +1807,6 @@ complex double *qpms_scatsysw_apply_Tmatrices_full(
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ccart3_t qpms_scatsys_eval_E(const qpms_scatsys_t *ss,
|
ccart3_t qpms_scatsys_eval_E(const qpms_scatsys_t *ss,
|
||||||
const complex double *cvf, const cart3_t where,
|
const complex double *cvf, const cart3_t where,
|
||||||
const complex double k) {
|
const complex double k) {
|
||||||
|
|
Loading…
Reference in New Issue