From 7e74cb100c4b87e558a992b88a0c0b7d092c46f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ne=C4=8Dada?= <marek@necada.org>
Date: Mon, 30 Sep 2019 14:37:28 +0300
Subject: [PATCH] Experimental speedup by earlier cutoff for Ewald summation.

TODO check whether PGen is properly destroyed


Former-commit-id: db2c64292b9c1bd80e97255bf71e570385b409a4
---
 qpms/CMakeLists.txt |  5 +--
 qpms/ewald.c        | 80 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/qpms/CMakeLists.txt b/qpms/CMakeLists.txt
index d335f93..575aca8 100644
--- a/qpms/CMakeLists.txt
+++ b/qpms/CMakeLists.txt
@@ -3,8 +3,6 @@ find_package(GSL 2.0 REQUIRED)
 find_package(BLAS REQUIRED)
 find_package(LAPACK REQUIRED)
 
-add_definitions(-DLATTICESUMS32)
-add_definitions(-DQPMS_VECTORS_NICE_TRANSFORMATIONS)
 
 #includes
 set (DIRS ${GSL_INCLUDE_DIRS} ${GSLCBLAS_INCLUDE_DIRS})
@@ -29,6 +27,9 @@ target_link_libraries (qpms
 target_include_directories (qpms PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
 target_compile_options(qpms PRIVATE -Wall -Wno-return-type)
+target_compile_definitions(qpms PRIVATE LATTICESUMS32 QPMS_VECTORS_NICE_TRANSFORMATIONS
+	EWALD_AUTO_CUTOFF
+	)
 
 install(TARGETS qpms
 	LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
diff --git a/qpms/ewald.c b/qpms/ewald.c
index 7720a33..3d08de0 100644
--- a/qpms/ewald.c
+++ b/qpms/ewald.c
@@ -29,6 +29,8 @@
 #define M_SQRTPI 1.7724538509055160272981674833411452
 #endif
 
+#define MAX(x,y) ((x)<(y)?(y):(x))
+#define MIN(x,y) ((x)<(y)?(x):(y))
 
 
 // sloppy implementation of factorial
@@ -221,6 +223,7 @@ int ewald3_21_xy_sigma_long (
   assert((latdim & LAT_XYONLY) && (latdim & SPACE3D));
   assert((latdim & LAT1D) || (latdim & LAT2D));
   const qpms_y_t nelem_sc = c->nelem_sc;
+  assert(nelem_sc > 0);
   const qpms_l_t lMax = c->lMax;
   
   // Manual init of the ewald summation targets
@@ -232,6 +235,23 @@ int ewald3_21_xy_sigma_long (
     memset(err, 0, nelem_sc * sizeof(double));
   }
 
+#ifdef EWALD_AUTO_CUTOFF
+  /* 
+   * Experimental: stop the evaluation if the relative contribution of the "last layer"
+   * is below DBL_EPSILON to speed up the calculation. 
+   * This obviously can currently work only for "suitable PGen"s that generate points in
+   * layers with increasing distance from the origin.
+   * Currently, the "layers" are chunks of 12 * lw + 1 points, where w is an
+   * increasing integer index (starting from 1).
+   * TODO: define the layers (optionally) in the PGen metadata and switch this feature on
+   * during run time instead of using this EWALD_AUTO_CUTOFF macro.
+   */
+  size_t lw = 1; // "Layer" index.
+  size_t li = 0; // Counter of points inside the current "layer".
+  double lsum, lsum_c; // Layer contribution magnitude sum + kahan compensation
+  kahaninit(&lsum, &lsum_c);
+#endif
+
   const complex double commonfac = 1/(k*k*unitcell_volume); // used in the very end (CFC)
   if (k_is_real)
     assert(creal(commonfac) > 0);
@@ -317,12 +337,34 @@ int ewald3_21_xy_sigma_long (
         }
         jsum *= phasefac * factor1d; // PFC
         ckahanadd(target + y, target_c + y, jsum);
+#ifdef EWALD_AUTO_CUTOFF
+        kahanadd(&lsum, &lsum_c, cabs(jsum));
+#endif
         if(err) kahanadd(err + y, err_c + y, jsum_err);
       }
 #ifndef NDEBUG
     rbeta_pq_prev = rbeta_pq;
+#endif
+#ifdef EWALD_AUTO_CUTOFF
+    ++li;
+    double cursum_min = INFINITY;
+    for (qpms_y_t y = 0; y < nelem_sc; ++y) {
+      const double a = cabs(target[y]);
+      if (a) // Skip possible symmetry-induced zeros; TODO this might need to be done in completely different way, perhaps on per-y basis
+        cursum_min = MIN(cursum_min, a);
+    }
+    if (li >= 12 * lw + 1) {
+      if(lsum < DBL_EPSILON * cursum_min)
+        goto ewald3_21_xy_sigma_long_end_point_loop;
+      kahaninit(&lsum, &lsum_c);
+      li = 0;
+      ++lw;
+    }
 #endif
   } // END POINT LOOP
+#ifdef EWALD_AUTO_CUTOFF
+ewald3_21_xy_sigma_long_end_point_loop:
+#endif
  
   free(err_c);
   free(target_c);
@@ -606,8 +648,17 @@ int ewald3_sigma_short(
     err_c = calloc(nelem_sc, sizeof(double));
     memset(err, 0, nelem_sc * sizeof(double));
   }
-  
 
+#ifdef EWALD_AUTO_CUTOFF
+  /* 
+   * Experimental; see the notes at the first #ifdef EWALD_AUTO_CUTOFF in this file.
+   */
+  size_t lw = 1; // "Layer" index.
+  size_t li = 0; // Counter of points inside the current "layer".
+  double lsum, lsum_c; // Layer contribution magnitude sum + kahan compensation
+  kahaninit(&lsum, &lsum_c);
+#endif
+  
   PGenSphReturnData pgen_retdata;
 #ifndef NDEBUG
   double r_pq_shifted_prev;
@@ -705,15 +756,38 @@ int ewald3_sigma_short(
         if(err)
           kahanadd(err + y, err_c + y, cabs(leg * (prefacn / I) * R_pq_pown
               * interr[n])); // TODO include also other errors
-        ckahanadd(target + y, target_c + y,
-            prefacn * R_pq_pown * leg * cintres[n] * e_beta_Rpq * e_imf * min1pow_m_neg(m));
+        complex double thesummand = prefacn * R_pq_pown * leg * cintres[n] * e_beta_Rpq * e_imf * min1pow_m_neg(m);
+        ckahanadd(target + y, target_c + y, thesummand);
+#ifdef EWALD_AUTO_CUTOFF
+        kahanadd(&lsum, &lsum_c, cabs(thesummand));
+#endif
       }
 
     }
 #ifndef NDEBUG
     r_pq_shifted_prev = r_pq_shifted;
+#endif
+#ifdef EWALD_AUTO_CUTOFF
+    ++li;
+    double cursum_min = INFINITY;
+    for (qpms_y_t y = 0; y < nelem_sc; ++y) {
+      const double a = cabs(target[y]);
+      if (a) // Skip possible symmetry-induced zeros; TODO this might need to be done in completely different way, perhaps on per-y basis
+        cursum_min = MIN(cursum_min, a);
+    }
+    if (li >= 12 * lw + 1) {
+      if(lsum < DBL_EPSILON * cursum_min)
+        goto ewald3_sigma_short_end_point_loop;
+      kahaninit(&lsum, &lsum_c);
+      li = 0;
+      ++lw;
+    }
 #endif
   }
+  //END POINT LOOP
+#ifdef EWALD_AUTO_CUTOFF
+ewald3_sigma_short_end_point_loop:
+#endif
 
   gsl_integration_workspace_free(workspace);
   if(err) free(err_c);