diff --git a/qpms/apps/2dlattice_ewald.c b/qpms/apps/2dlattice_ewald.c
new file mode 100644
index 0000000..48fd29b
--- /dev/null
+++ b/qpms/apps/2dlattice_ewald.c
@@ -0,0 +1,154 @@
+// c99 -o ew_gen_kin -Wall -I ../.. -O2 -ggdb -DQPMS_VECTORS_NICE_TRANSFORMATIONS -DLATTICESUMS32 2dlattice_ewald.c ../translations.c ../ewald.c ../ewaldsf.c ../gaunt.c ../lattices2d.c ../latticegens.c -lgsl -lm -lblas 
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <qpms/translations.h>
+#include <qpms/lattices.h>
+#include <gsl/gsl_const_mksa.h>
+
+// Command line order:
+// outfile b1.x b1.y b2.x b2.y lMax scuffomega refindex npart part0.x part0.y [part1.x part1.y [...]]
+//
+// Standard input (per line):
+// k.x k.y
+//
+// Output data format (line):
+//
+
+
+#define MAXKCOUNT 200 // 200 // serves as klist default buffer size 
+//#define KMINCOEFF 0.783 //0.9783 // 0.783 // not used if KSTDIN defined
+//#define KMAXCOEFF 1.217 //1.0217 // 1.217 // not used if KSTDIN defined
+#define KLAYERS 20
+#define RLAYERS 20
+
+const double s3 = 1.732050807568877293527446341505872366942805253810380628055;
+
+// IMPORTANT: lattice properties here
+const qpms_y_t lMax = 3;
+const double REFINDEX = 1.52;
+const double LATTICE_H = 576e-9;
+static const double SCUFF_OMEGAUNIT = 3e14;
+static const double hbar = GSL_CONST_MKSA_PLANCKS_CONSTANT_HBAR;
+static const double eV = GSL_CONST_MKSA_ELECTRON_CHARGE;
+static const double c0 = GSL_CONST_MKSA_SPEED_OF_LIGHT;
+
+int main (int argc, char **argv) {
+  //const double LATTICE_A = s3*LATTICE_H;
+  //const double INVLATTICE_A = 4*M_PI / s3 / LATTICE_A;
+
+  if (argc < 12) abort();
+
+  char *outfile = argv[1];
+  char *errfile = NULL; // Filename for the error estimate output; NOT USED
+  cart2_t b1 = {strtod(argv[2], NULL), strtod(argv[3], NULL)},
+          b2 = {strtod(argv[4], NULL), strtod(argv[5], NULL)};
+  const qpms_l_t lMax = strtol(argv[6], NULL, 10); assert(lMax>0);
+  const double scuffomega = strtod(argv[7], NULL);
+  const double refindex = strtod(argv[8], NULL);
+  const int npart = strtol(argv[9], NULL, 10);  assert(npart>0);
+  assert(argc >= 2*npart + 10);
+  assert(npart > 0);
+  cart2_t part_positions[npart];
+  for(int p = 0; p < npart; ++p) {
+    part_positions[p].x = strtod(argv[10+2*p], NULL);
+    part_positions[p].y = strtod(argv[10+2*p+1], NULL);
+  }
+
+//#ifdef KSTDIN
+  size_t kcount = 0;
+  size_t klist_capacity = MAXKCOUNT;
+  cart2_t *klist = malloc(sizeof(cart2_t) * klist_capacity);
+  while (scanf("%lf %lf", &(klist[kcount].x), &(klist[kcount].y)) == 2) {
+    ++kcount;
+    if(kcount >= klist_capacity) {
+      klist_capacity *= 2;
+      klist = realloc(klist, sizeof(cart2_t) * klist_capacity);
+      if (klist == NULL) abort();
+    }
+  }
+//#else
+#if 0
+  cart2_t klist[MAXKCOUNT];
+  int kcount = MAXKCOUNT;
+  for (int i = 0; i <  kcount; ++i) { // TODO this should depend on orientation...
+    klist[i].x = 0;
+    klist[i].y = (4.* M_PI / 3. / LATTICE_A) * (KMINCOEFF + (KMAXCOEFF-KMINCOEFF)/kcount*i);
+  }
+#endif
+
+  const double unitcell_area = l2d_unitcell_area(b1, b2);
+  l2d_reduceBasis(b1, b2, &b1, &b2);
+  
+  // TODO more clever way of determining the cutoff
+  const double a = sqrt(unitcell_area); // N.B. different meaning than before
+  const double maxR = 25 * a;
+  const double maxK = 25 * 2*M_PI/a;
+
+  qpms_trans_calculator *c = qpms_trans_calculator_init(lMax, QPMS_NORMALISATION_POWER_CS); // vai POWER_CS?
+
+  FILE *out = fopen(outfile, "w");
+  FILE *err = NULL;
+  if (errfile)
+    err = fopen(errfile, "w");
+
+  {
+    const double omega = scuffomega * SCUFF_OMEGAUNIT;
+    const double EeV = omega * hbar / eV;
+    const double k0_vac = omega / c0;
+    const double k0_eff  = k0_vac * refindex;
+    const double eta = 5.224/a; // FIXME quite arbitrary, but this one should work
+
+    // indices : destpart (A/B-particle), srcpart (A/B-particle), coeff type (A/B- type), desty, srcy
+    complex double W[npart][npart][2][c->nelem][c->nelem];
+    double Werr[npart][npart][npart][c->nelem][c->nelem];
+
+    for (size_t ki = 0; ki < kcount; ++ki) {
+      cart2_t beta = klist[ki];
+      memset(W, 0, sizeof(W));
+      if(err)
+        memset(Werr, 0, sizeof(Werr));
+
+      const ptrdiff_t deststride = &(W[0][0][0][1][0]) - &(W[0][0][0][0][0]);
+      const ptrdiff_t srcstride = &(W[0][0][0][0][1]) - &(W[0][0][0][0][0]);
+      assert (srcstride == 1 && deststride == c->nelem);
+
+      for (size_t ps = 0; ps < npart; ++ps) for (size_t pd = 0; pd < npart; ++pd) 
+        // TODO optimize (calculate only once for each particle shift; especially if pd == ps)
+        qpms_trans_calculator_get_AB_arrays_e32(c,
+            &(W[pd][ps][0][0][0]), err ? &(Werr[pd][ps][0][0][0]) : NULL, // Adest, Aerr,
+            &(W[pd][ps][1][0][0]), err ? &(Werr[pd][ps][1][0][0]) : NULL, // Bdest, Berr,
+            deststride, srcstride,
+            eta, k0_eff, b1, b2,
+            beta,
+            cart2_substract(part_positions[pd], part_positions[ps]),  // CHECKSIGN
+            maxR, maxK 
+            );
+        // TODO CHECK B<-A vs. A<-B relation
+
+      fprintf(out, "%.16g\t%.16g\t%.16g\t%.16g\t%.16g\t",
+          scuffomega, EeV, k0_eff, beta.x, beta.y);
+      if(err) fprintf(err, "%.16g\t%.16g\t%16g\t%.16g\t%.16g\t",
+          scuffomega, EeV, k0_eff, beta.x, beta.y);
+      size_t totalelems = sizeof(W) / sizeof(complex double);
+      for (size_t i = 0; i < totalelems; ++i) {
+        complex double w = ((complex double *)W)[i];
+        fprintf(out, "%.16g\t%.16g\t", creal(w), cimag(w));
+        if (err) 
+          fprintf(err, "%.3g\t", ((double *)Werr)[i]);
+      }
+      fputc('\n', out);
+      if(err) fputc('\n', err);
+    }
+  }
+
+  fclose(out);
+  if(err) fclose(err);
+
+//#ifdef KSTDIN
+  free(klist);
+//#endif
+
+  qpms_trans_calculator_free(c);
+
+}
diff --git a/qpms/translations.c b/qpms/translations.c
index 816bbd6..ee0f0bf 100644
--- a/qpms/translations.c
+++ b/qpms/translations.c
@@ -1154,6 +1154,125 @@ int qpms_trans_calculator_get_AB_arrays(const qpms_trans_calculator *c,
 }
 
 
+
+#ifdef LATTICESUMS31
+int qpms_trans_calculator_get_AB_arrays_e31z_both_points_and_shift(const qpms_trans_calculator *c,
+    complex double * const Adest, double * const Aerr,
+    complex double * const Bdest, double * const Berr,
+    const ptrdiff_t deststride, const ptrdiff_t srcstride,
+    /* qpms_bessel_t J*/ // assume QPMS_HANKEL_PLUS
+    const double eta, const double k, const double unitcell_area,
+    const size_t nRpoints, const cart2_t *Rpoints, // n.b. can't contain 0; TODO automatic recognition and skip
+    const size_t nKpoints, const cart2_t *Kpoints,
+    const double beta,//DIFF21
+    const double particle_shift//DIFF21
+    )
+{
+
+  const qpms_y_t nelem2_sc = qpms_lMax2nelem_sc(c->e32c->lMax);
+  //const qpms_y_t nelem = qpms_lMax2nelem(c->lMax);
+  const bool doerr = Aerr || Berr;
+  const bool do_sigma0 = (particle_shift == 0)//DIFF21((particle_shift.x == 0) && (particle_shift.y == 0)); // FIXME ignoring the case where particle_shift equals to lattice vector
+
+  complex double *sigmas_short = malloc(sizeof(complex double)*nelem2_sc);
+  complex double *sigmas_long = malloc(sizeof(complex double)*nelem2_sc);
+  complex double *sigmas_total = malloc(sizeof(complex double)*nelem2_sc);
+  double *serr_short, *serr_long, *serr_total;
+  if(doerr) {
+    serr_short = malloc(sizeof(double)*nelem2_sc);
+    serr_long = malloc(sizeof(double)*nelem2_sc);
+    serr_total = malloc(sizeof(double)*nelem2_sc);
+  } else serr_short = serr_long = serr_total = NULL;
+
+  int retval;
+  retval = ewald31z_sigma_long_points_and_shift(sigmas_long, serr_long, //DIFF21
+      c->e32c, eta, k, unitcell_area, nKpoints, Kpoints, beta, particle_shift);
+  if (retval) abort();
+
+  retval = ewald31z_sigma_short_points_and_shift(sigmas_short, serr_short, //DIFF21
+      c->e32c, eta, k, nRpoints, Rpoints, beta, particle_shift);
+  if (retval) abort();
+
+  for(qpms_y_t y = 0; y < nelem2_sc; ++y)
+    sigmas_total[y] = sigmas_short[y] + sigmas_long[y];
+  if (doerr) for(qpms_y_t y = 0; y < nelem2_sc; ++y)
+    serr_total[y]  = serr_short[y] + serr_long[y];
+
+  complex double sigma0 = 0; double sigma0_err = 0;
+  if (do_sigma0) {
+    retval = ewald31z_sigma0(&sigma0, &sigma0_err, c->e32c, eta, k);//DIFF21
+    if(retval) abort();
+    const qpms_l_t y = qpms_mn2y_sc(0,0);
+    sigmas_total[y] += sigma0;
+    if(doerr) serr_total[y] += sigma0_err;
+  }
+  
+  switch(qpms_normalisation_t_normonly(c->normalisation)) {
+    case QPMS_NORMALISATION_TAYLOR:
+    case QPMS_NORMALISATION_POWER:
+    case QPMS_NORMALISATION_NONE:
+      {
+        ptrdiff_t desti = 0, srci = 0;
+        for (qpms_l_t n = 1; n <= c->lMax; ++n) for (qpms_m_t m = -n; m <= n; ++m) {
+          for (qpms_l_t nu = 1; nu <= c->lMax; ++nu) for (qpms_m_t mu = -nu; mu <= nu; ++mu){
+            const size_t i = qpms_trans_calculator_index_mnmunu(c, m, n, mu, nu);
+            const size_t qmax = c->A_multipliers[i+1] - c->A_multipliers[i] - 1;
+            complex double Asum, Asumc; ckahaninit(&Asum, &Asumc);
+            double Asumerr, Asumerrc; if(Aerr) kahaninit(&Asumerr, &Asumerrc);
+            
+            const qpms_m_t mu_m  = mu - m;
+            // TODO skip if ... (N.B. skip will be different for 31z and 32)
+            for(qpms_l_t q = 0; q <= qmax; ++q) {
+              const qpms_l_t p = n + nu - 2*q;
+              const qpms_y_t y_sc = qpms_mn2y_sc(mu_m, p); 
+              const complex double multiplier = c->A_multipliers[i][q];
+              complex double sigma = sigmas_total[y_sc];
+              ckahanadd(&Asum, &Asumc, multiplier * sigma);
+              if (Aerr) kahanadd(&Asumerr, &Asumerrc, multiplier * serr_total[y_sc]);
+            }
+
+            *(Adest + deststride * desti + srcstride * srci) = Asum;
+            if (Aerr) *(Aerr + deststride * desti + srcstride * srci) = Asumerr;
+            
+            // TODO skip if ...
+            complex double Bsum, Bsumc; ckahaninit(&Bsum, &Bsumc);
+            double Bsumerr, Bsumerrc; if(Berr) kahaninit(&Bsumerr, &Bsumerrc);
+             for(qpms_l_t q = 0; q <= qmax; ++q) {
+              const qpms_l_t p_ = n + nu - 2*q + 1;
+              const qpms_y_t y_sc = qpms_mn2y_sc(mu_m, p_); 
+              const complex double multiplier = c->B_multipliers[i][q-BQ_OFFSET];
+              complex double sigma = sigmas_total[y_sc];
+              ckahanadd(&Bsum, &Bsumc, multiplier * sigma);
+              if (Berr) kahanadd(&Bsumerr, &Bsumerrc, multiplier * serr_total[y_sc]);
+            }
+
+            *(Bdest + deststride * desti + srcstride * srci) = Bsum;
+            if (Berr) *(Berr + deststride * desti + srcstride * srci) = Bsumerr;
+
+            ++srci;
+          }
+          ++desti;
+          srci = 0;
+        }
+      }
+      break;
+    default:
+      abort();
+  }
+
+  free(sigmas_short);
+  free(sigmas_long);
+  free(sigmas_total);
+  if(doerr) {
+    free(serr_short);
+    free(serr_long);
+    free(serr_total);
+  }
+  return 0;
+}
+#endif LATTICESUMS_31
+
+
 #ifdef LATTICESUMS32
 
 int qpms_trans_calculator_get_AB_arrays_e32_both_points_and_shift(const qpms_trans_calculator *c,
@@ -1271,48 +1390,28 @@ int qpms_trans_calculator_get_AB_arrays_e32_both_points_and_shift(const qpms_tra
   return 0;
 }
 
-#if 0
-int qpms_trans_calculator_e32_long_points_and_shift(const qpms_trans_calculator *c,
-                complex double *Adest_long, double *Aerr_long,
-                complex double *Bdest_long, double *Berr_long,
-                double eta, double k, double unitcell_area,
-                size_t npoints, const cart2_t *Kpoints,
-                cart2_t beta,
-                cart2_t particle_shift
-                )
-{
-  
-}
 
-int qpms_trans_calculator_e32_short_points_and_shift(const qpms_trans_calculator *c,
-                complex double *Adest_short, double *Aerr_short,
-                complex double *Bdest_short, double *Berr_short,
-                double eta, double k,
-                size_t npoints, const cart2_t *Rpoints,
-                cart2_t beta,
-                cart2_t particle_shift
-                };
-#endif // 0
-#endif // LATTICESUMS32
-
-#ifdef LATTICESUMS31
-int qpms_trans_calculator_get_AB_arrays_e31z_both_points_and_shift(const qpms_trans_calculator *c,
+// N.B. alternative point generation strategy toggled by macro GEN_RSHIFTEDPOINTS
+// and GEN_KSHIFTEDPOINTS.
+// The results should be the same. The performance can slightly differ (especially
+// if some optimizations in the point generators are implemented.)
+int qpms_trans_calculator_get_AB_arrays_e32(const qpms_trans_calculator *c,
     complex double * const Adest, double * const Aerr,
     complex double * const Bdest, double * const Berr,
     const ptrdiff_t deststride, const ptrdiff_t srcstride,
     /* qpms_bessel_t J*/ // assume QPMS_HANKEL_PLUS
-    const double eta, const double k, const double unitcell_area,
-    const size_t nRpoints, const cart2_t *Rpoints, // n.b. can't contain 0; TODO automatic recognition and skip
-    const size_t nKpoints, const cart2_t *Kpoints,
-    const double beta,//DIFF21
-    const double particle_shift//DIFF21
+    const double eta, const double k,
+    const cart2_t b1, const cart2_t b2,
+    const cart2_t beta,
+    const cart2_t particle_shift,
+    double maxR, double maxK
     )
 {
 
   const qpms_y_t nelem2_sc = qpms_lMax2nelem_sc(c->e32c->lMax);
   //const qpms_y_t nelem = qpms_lMax2nelem(c->lMax);
   const bool doerr = Aerr || Berr;
-  const bool do_sigma0 = (particle_shift == 0)//DIFF21((particle_shift.x == 0) && (particle_shift.y == 0)); // FIXME ignoring the case where particle_shift equals to lattice vector
+  const bool do_sigma0 = ((particle_shift.x == 0) && (particle_shift.y == 0)); // FIXME ignoring the case where particle_shift equals to lattice vector
 
   complex double *sigmas_short = malloc(sizeof(complex double)*nelem2_sc);
   complex double *sigmas_long = malloc(sizeof(complex double)*nelem2_sc);
@@ -1324,13 +1423,48 @@ int qpms_trans_calculator_get_AB_arrays_e31z_both_points_and_shift(const qpms_tr
     serr_total = malloc(sizeof(double)*nelem2_sc);
   } else serr_short = serr_long = serr_total = NULL;
 
+  const double unitcell_area = l2d_unitcell_area(b1, b2);
+  cart2_t rb1, rb2; // reciprocal basis
+  if (QPMS_SUCCESS != l2d_reciprocalBasis2pi(b1, b2, &rb1, &rb2)) abort();
+
+  PGen Rgen = PGen_xyWeb_new(b1, b2, BASIS_RTOL, 
+#ifdef GEN_RSHIFTEDPOINTS
+      cart2_scale(-1 /*CHECKSIGN*/, particle_shift),
+#else
+      CART2_ZERO,
+#endif
+      0, !do_sigma0, maxR, false);
+  PGen Kgen = PGen_xyWeb_new(rb1, rb2, BASIS_RTOL,
+#ifdef GEN_KSHIFTEDPOINTS
+      beta,
+#else
+     CART2_ZERO,
+#endif
+     0, true, maxK, false);
+
   int retval;
-  retval = ewald31z_sigma_long_points_and_shift(sigmas_long, serr_long, //DIFF21
-      c->e32c, eta, k, unitcell_area, nKpoints, Kpoints, beta, particle_shift);
+  //retval = ewald32_sigma_long_points_and_shift(sigmas_long, serr_long,
+  //    c->e32c, eta, k, unitcell_area, nKpoints, Kpoints, beta, particle_shift);
+  retval = ewald3_sigma_long(sigmas_long, serr_long, c->e32c, eta, k, 
+      unitcell_area, LAT_2D_IN_3D_XYONLY, &Kgen,
+#ifdef GEN_KSHIFTEDPOINTS
+      true,
+#else
+      false,
+#endif
+      cart22cart3xy(beta), cart22cart3xy(particle_shift));
   if (retval) abort();
 
-  retval = ewald31z_sigma_short_points_and_shift(sigmas_short, serr_short, //DIFF21
-      c->e32c, eta, k, nRpoints, Rpoints, beta, particle_shift);
+  //retval = ewald32_sigma_short_points_and_shift(sigmas_short, serr_short,
+  //    c->e32c, eta, k, nRpoints, Rpoints, beta, particle_shift);
+  retval = ewald3_sigma_short(sigmas_short, serr_short, c->e32c, eta, k,
+      LAT_2D_IN_3D_XYONLY, &Rgen, 
+#ifdef GEN_RSHIFTEDPOINTS
+      true,
+#else
+      false,
+#endif
+      cart22cart3xy(beta), cart22cart3xy(particle_shift));
   if (retval) abort();
 
   for(qpms_y_t y = 0; y < nelem2_sc; ++y)
@@ -1340,7 +1474,7 @@ int qpms_trans_calculator_get_AB_arrays_e31z_both_points_and_shift(const qpms_tr
 
   complex double sigma0 = 0; double sigma0_err = 0;
   if (do_sigma0) {
-    retval = ewald31z_sigma0(&sigma0, &sigma0_err, c->e32c, eta, k);//DIFF21
+    retval = ewald32_sigma0(&sigma0, &sigma0_err, c->e32c, eta, k);
     if(retval) abort();
     const qpms_l_t y = qpms_mn2y_sc(0,0);
     sigmas_total[y] += sigma0;
@@ -1361,7 +1495,7 @@ int qpms_trans_calculator_get_AB_arrays_e31z_both_points_and_shift(const qpms_tr
             double Asumerr, Asumerrc; if(Aerr) kahaninit(&Asumerr, &Asumerrc);
             
             const qpms_m_t mu_m  = mu - m;
-            // TODO skip if ... (N.B. skip will be different for 31z and 32)
+            // TODO skip if ...
             for(qpms_l_t q = 0; q <= qmax; ++q) {
               const qpms_l_t p = n + nu - 2*q;
               const qpms_y_t y_sc = qpms_mn2y_sc(mu_m, p); 
@@ -1410,6 +1544,31 @@ int qpms_trans_calculator_get_AB_arrays_e31z_both_points_and_shift(const qpms_tr
   }
   return 0;
 }
+
+#if 0
+int qpms_trans_calculator_e32_long_points_and_shift(const qpms_trans_calculator *c,
+                complex double *Adest_long, double *Aerr_long,
+                complex double *Bdest_long, double *Berr_long,
+                double eta, double k, double unitcell_area,
+                size_t npoints, const cart2_t *Kpoints,
+                cart2_t beta,
+                cart2_t particle_shift
+                )
+{
+  
+}
+
+int qpms_trans_calculator_e32_short_points_and_shift(const qpms_trans_calculator *c,
+                complex double *Adest_short, double *Aerr_short,
+                complex double *Bdest_short, double *Berr_short,
+                double eta, double k,
+                size_t npoints, const cart2_t *Rpoints,
+                cart2_t beta,
+                cart2_t particle_shift
+                };
+#endif // 0
+#endif // LATTICESUMS32
+
 #ifdef LATTICESUMS_OLD
 
 int qpms_trans_calculator_get_shortrange_AB_arrays_buf(const qpms_trans_calculator *c,
diff --git a/qpms/translations.h b/qpms/translations.h
index bbf13b0..a80fde7 100644
--- a/qpms/translations.h
+++ b/qpms/translations.h
@@ -191,6 +191,18 @@ int qpms_trans_calculator_get_AB_arrays_e32_both_points_and_shift(const qpms_tra
 		const cart2_t beta,
 		const cart2_t particle_shift
 		);
+
+int qpms_trans_calculator_get_AB_arrays_e32(const qpms_trans_calculator *c,
+		complex double *Adest, double *Aerr,
+		complex double *Bdest, double *Berr,
+		const ptrdiff_t deststride, const ptrdiff_t srcstride,
+		const double eta, const double k,
+		cart2_t b1, cart2_t b2,
+		const cart2_t beta,
+		const cart2_t particle_shift,
+		double maxR, double maxK
+		);
+
 #endif //LATTICESUMS32
 
 #ifdef LATTICESUMS31