diff --git a/filter.c b/filter.c
index 80ea31c6..83e7a8d1 100644
--- a/filter.c
+++ b/filter.c
@@ -131,6 +131,7 @@ struct filter_in *create_filter_input(int const L,int const M, enum filtertype c
     r = fftwf_import_wisdom_from_filename(Wisdom_file);
     fprintf(stdout,"fftwf_import_wisdom_from_filename(%s) %s\n",Wisdom_file,r == 1 ? "succeeded" : "failed");
     fftwf_set_timelimit(Fftw_plan_timelimit);
+
     // Start FFT worker thread(s) if not already running
     for(int i=0;i < Nthreads;i++){
       if(FFT.thread[i] == (pthread_t)0)
@@ -305,6 +306,11 @@ int execute_filter_input(struct filter_in * const f){
   job->completion_cond = &f->filter_cond;
 
   // Set up the job and next input buffer
+  // We're assuming that the time-domain pointers we pass to the FFT always have the same alignment
+  // as we advance the FFT pointer by f->ilen (L) elements, modulo the mirror buffer size.
+  // They seem to, as long as ilen (L) has several factors of 2. For the real->complex transform,
+  // each element is 4 bytes long, so if L is divisible by 16 then the pointers stay aligned to 64 bytes,
+  // the usual size of a cache line. For complex->complex transforms (8-byte elements), L has to be divisible by 8.
   switch(f->in_type){
   default:
   case CROSS_CONJ: