From e4f368f47a9fc9e48edce4be6e535377cb73e84e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ne=C4=8Dada?= <marek@necada.org>
Date: Fri, 8 Nov 2019 17:04:13 +0200
Subject: [PATCH] Multi-threaded integration in axial symmetric T-matrix.

Former-commit-id: 71dc4ae5978f0dc10d82f4d8450fb2117ba64ce8
---
 qpms/tmatrices.c | 174 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 123 insertions(+), 51 deletions(-)

diff --git a/qpms/tmatrices.c b/qpms/tmatrices.c
index 9b86c47..78a1eea 100644
--- a/qpms/tmatrices.c
+++ b/qpms/tmatrices.c
@@ -23,6 +23,7 @@
 #include "qpms_specfunc.h"
 #include "normalisation.h"
 #include <errno.h>
+#include <pthread.h>
 #include <gsl/gsl_integration.h>
 
 // These are quite arbitrarily chosen constants for the quadrature in qpms_tmatrix_axialsym_fill()
@@ -474,11 +475,12 @@ qpms_errno_t qpms_load_scuff_tmatrix(
     complex double ** const tmdata
     ) {
   FILE *f = fopen(path, "r");
-  if (!f) 
+  if (!f) {
     if (qpms_load_scuff_tmatrix_crash_on_failure)
       qpms_pr_error_at_line(__FILE__, __LINE__, __func__,
           "Could not open T-matrix file %s", path); 
     else return errno;
+  }
   qpms_errno_t retval = 
     qpms_read_scuff_tmatrix(f, bs, n, freqs, freqs_su, tmatrices_array, tmdata);
 
@@ -519,7 +521,7 @@ complex double *qpms_mie_coefficients_reflection(
   qpms_l_t lMax = bspec->lMax;
   complex double x_i = k_i * a;
   complex double x_e = k_e * a;
-  complex double m = k_i / k_e;
+  // complex double m = k_i / k_e;
   complex double eta_inv_i = k_i / mu_i;
   complex double eta_inv_e = k_e / mu_e;
 
@@ -567,7 +569,6 @@ qpms_errno_t qpms_tmatrix_spherical_fill(qpms_tmatrix_t *t, ///< T-matrix whose
 		complex double mu_i, ///< Relative permeability of the sphere material.
 		complex double mu_e ///< Relative permeability of the surrounding medium.
 		) {
-  qpms_l_t lMax = t->spec->lMax;
   complex double *miecoeffs = qpms_mie_coefficients_reflection(NULL, t->spec, a, k_i, k_e, mu_i, mu_e,
     QPMS_BESSEL_REGULAR, QPMS_HANKEL_PLUS);
   memset(t->m, 0, SQ(t->spec->n)*sizeof(complex double));
@@ -691,6 +692,91 @@ static double tmatrix_axialsym_integrand(double theta, void *param) {
   return p->realpart ? creal(res) : cimag(res);
 }
 
+
+
+long qpms_axsym_tmatrix_integration_nthreads_default = 4;
+long qpms_axsym_tmatrix_integration_nthreads_override = 0;
+
+void qpms_axsym_tmatrix_integration_set_nthreads(long n) {
+    qpms_axsym_tmatrix_integration_nthreads_override = n;
+}
+
+struct qpms_tmatrix_axialsym_fill_integration_thread_arg {
+  complex double *Q, *R;
+  const qpms_vswf_set_spec_t *bspecQR;
+  double epsrel, epsabs;
+  size_t intlimit;
+  qpms_arc_function_t f;
+  const struct tmatrix_axialsym_integral_param_t *p_template; // Thread must copy this and set its own realpart and btype fields.
+  size_t *i2start_ptr;
+  pthread_mutex_t *i2start_mutex;
+};
+
+static void * qpms_tmatrix_axialsym_fill_integration_thread(void *arg) {
+  const struct qpms_tmatrix_axialsym_fill_integration_thread_arg *a = arg;
+  struct tmatrix_axialsym_integral_param_t p = *a->p_template;
+  const gsl_function f = {tmatrix_axialsym_integrand, (void *) &p};
+  gsl_integration_workspace *w = gsl_integration_workspace_alloc(a->intlimit);
+
+  while(1) {
+    QPMS_ENSURE_SUCCESS(pthread_mutex_lock(a->i2start_mutex));
+    if(*(a->i2start_ptr) >= a->bspecQR->n) {// Everything already done, leave thread
+      QPMS_ENSURE_SUCCESS(pthread_mutex_unlock(a->i2start_mutex));
+      break;
+    }
+    const size_t i2 = *(a->i2start_ptr);
+    ++*(a->i2start_ptr);
+    QPMS_ENSURE_SUCCESS(pthread_mutex_unlock(a->i2start_mutex));
+    for(size_t i1 = 0; i1 < a->bspecQR->n; ++i1) {
+      // NOTE that the Q', R' matrices are !TRANSPOSED! here because of zgetrs
+      const size_t iQR = i1 + i2 * a->bspecQR->n;
+      qpms_m_t m1, m2;
+      qpms_l_t l1, l2;
+      qpms_vswf_type_t t1, t2;
+      const qpms_uvswfi_t u1 = a->bspecQR->ilist[i1], u2 = a->bspecQR->ilist[i2];
+      QPMS_ENSURE_SUCCESS(qpms_uvswfi2tmn(u1, &t1, &m1, &l1));
+      QPMS_ENSURE_SUCCESS(qpms_uvswfi2tmn(u2, &t2, &m2, &l2));
+      if (m1 + m2) {
+        a->Q[iQR] = 0;
+        a->R[iQR] = 0;
+      } else {
+        p.m = m1;
+        p.l = l1; p.l_in = l2;
+        p.t = t1; p.t_in = t2;
+
+        double result; 
+        double abserr; // gsl_integration_qag() needs this; thrown away for now
+        // Re(R')
+        p.btype = QPMS_BESSEL_REGULAR;
+        p.realpart = true;
+        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, a->epsabs, a->epsrel,
+              a->intlimit, 2, w, &result, &abserr));
+        a->R[iQR] = result;
+        
+        // Im(R')
+        p.realpart = false;
+        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, a->epsabs, a->epsrel,
+              a->intlimit, 2, w, &result, &abserr));
+        a->R[iQR] += I*result;
+
+        // Re(Q')
+        p.btype = QPMS_HANKEL_PLUS;
+        p.realpart = true;
+        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, a->epsabs, a->epsrel,
+              a->intlimit, 2, w, &result, &abserr));
+        a->Q[iQR] = result;
+
+        // Im(Q')
+        p.realpart = false;
+        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, a->epsabs, a->epsrel,
+              a->intlimit, 2, w, &result, &abserr));
+        a->Q[iQR] += I*result;
+      }
+    }
+  }
+  gsl_integration_workspace_free(w);
+}
+
 qpms_errno_t qpms_tmatrix_axialsym_fill(
 		qpms_tmatrix_t *t, complex double omega, qpms_epsmu_t outside,
 		qpms_epsmu_t inside,qpms_arc_function_t shape, qpms_l_t lMaxQR)
@@ -704,7 +790,6 @@ qpms_errno_t qpms_tmatrix_axialsym_fill(
   p.z_in = qpms_waveimpedance(inside);
   p.f = shape;
   p.bspec = bspec;
-  const gsl_function f = {tmatrix_axialsym_integrand, (void *) &p};
 
   if (lMaxQR < bspec->lMax) lMaxQR = bspec->lMax;
   qpms_vswf_set_spec_t *bspecQR = qpms_vswf_set_spec_from_lMax(lMaxQR, bspec->norm);
@@ -719,55 +804,43 @@ qpms_errno_t qpms_tmatrix_axialsym_fill(
   const size_t intlimit = 1024;
   const double epsrel = TMATRIX_AXIALSYM_INTEGRAL_EPSREL;
   const double epsabs = TMATRIX_AXIALSYM_INTEGRAL_EPSABS;
-  gsl_integration_workspace *w = gsl_integration_workspace_alloc(intlimit);
-  for(size_t i1 = 0; i1 < bspecQR->n; ++i1)
-    for(size_t i2 = 0; i2 < bspecQR->n; ++i2) {
-      // NOTE that the Q', R' matrices are !TRANSPOSED! here because of zgetrs
-      const size_t iQR = i1 + i2 * bspecQR->n;
-      qpms_m_t m1, m2;
-      qpms_l_t l1, l2;
-      qpms_vswf_type_t t1, t2;
-      const qpms_uvswfi_t u1 = bspecQR->ilist[i1], u2 = bspecQR->ilist[i2];
-      QPMS_ENSURE_SUCCESS(qpms_uvswfi2tmn(u1, &t1, &m1, &l1));
-      QPMS_ENSURE_SUCCESS(qpms_uvswfi2tmn(u2, &t2, &m2, &l2));
-      if (m1 + m2) {
-        Q[iQR] = 0;
-        R[iQR] = 0;
-      } else {
-        p.m = m1;
-        p.l = l1; p.l_in = l2;
-        p.t = t1; p.t_in = t2;
 
-        double result; 
-        double abserr; // gsl_integration_qag() needs this; thrown away for now
-        // Re(R')
-        p.btype = QPMS_BESSEL_REGULAR;
-        p.realpart = true;
-        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, epsabs, epsrel,
-              intlimit, 2, w, &result, &abserr));
-        R[iQR] = result;
-        
-        // Im(R')
-        p.realpart = false;
-        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, epsabs, epsrel,
-              intlimit, 2, w, &result, &abserr));
-        R[iQR] += I*result;
+  //==== multi-threaded integration begin
+  size_t i2start = 0;
+  pthread_mutex_t i2start_mutex;
+  QPMS_ENSURE_SUCCESS(pthread_mutex_init(&i2start_mutex, NULL));
+  const struct qpms_tmatrix_axialsym_fill_integration_thread_arg
+    arg = {Q, R, bspecQR, epsrel, epsabs, intlimit, shape, &p, &i2start, &i2start_mutex};
 
-        // Re(Q')
-        p.btype = QPMS_HANKEL_PLUS;
-        p.realpart = true;
-        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, epsabs, epsrel,
-              intlimit, 2, w, &result, &abserr));
-        Q[iQR] = result;
-
-        // Im(Q')
-        p.realpart = false;
-        QPMS_ENSURE_SUCCESS(gsl_integration_qag(&f, 0, M_PI, epsabs, epsrel,
-              intlimit, 2, w, &result, &abserr));
-        Q[iQR] += I*result;
-      }
+  // FIXME THIS IS NOT PORTABLE (_SC_NPROCESSORS_ONLN is not a standard POSIX thing)
+  long nthreads;
+  if (qpms_axsym_tmatrix_integration_nthreads_override > 0) {
+    nthreads = qpms_axsym_tmatrix_integration_nthreads_override;
+    QPMS_DEBUG(QPMS_DBGMSG_THREADS, "Using overriding value of %ld thread(s).",
+        nthreads);
+  } else {
+    nthreads = sysconf(_SC_NPROCESSORS_ONLN);
+    if (nthreads < 1) {
+      QPMS_DEBUG(QPMS_DBGMSG_THREADS, "_SC_NPROCESSORS_ONLN returned %ld, using %ld thread(s) instead.",
+         nthreads, qpms_axsym_tmatrix_integration_nthreads_default);
+      nthreads = qpms_axsym_tmatrix_integration_nthreads_default;
+    } else {
+      QPMS_DEBUG(QPMS_DBGMSG_THREADS, "_SC_NRPOCESSORS_ONLN returned %ld.", nthreads);
     }
-  gsl_integration_workspace_free(w);
+  }
+  nthreads = MIN(nthreads, bspec->n); // don't make redundant threads 
+  pthread_t thread_ids[nthreads];
+  for(long thi = 0; thi < nthreads; ++thi)
+    QPMS_ENSURE_SUCCESS(pthread_create(thread_ids + thi, NULL,
+      qpms_tmatrix_axialsym_fill_integration_thread,
+      (void *) &arg));
+  for(long thi = 0; thi < nthreads; ++thi) {
+    void *retval;
+    QPMS_ENSURE_SUCCESS(pthread_join(thread_ids[thi], &retval));
+  }
+  QPMS_ENSURE_SUCCESS(pthread_mutex_destroy(&i2start_mutex));
+  //===== multi-threaded integration end
+
 
   // Compute T-matrix; maybe it would be better solved with some sparse matrix mechanism,
   // but fukkit.
@@ -777,7 +850,6 @@ qpms_errno_t qpms_tmatrix_axialsym_fill(
   QPMS_ENSURE_SUCCESS(LAPACKE_zgetrf(LAPACK_ROW_MAJOR, n, n, Q, n, pivot));
   // Solve Q'^T X = R'^T where X will be -T^T
   // Note that Q'^T, R'^T are already (transposed) in Q, R.
-  const complex double minus1 = -1;
   QPMS_ENSURE_SUCCESS(LAPACKE_zgetrs(LAPACK_ROW_MAJOR, 'N', n, n /*nrhs*/,
         Q, n, pivot, R, n));
   // R now contains -T^T.