diff --git a/include/lc3_private.h b/include/lc3_private.h
index 3d9ec0f..467d028 100644
--- a/include/lc3_private.h
+++ b/include/lc3_private.h
@@ -108,12 +108,12 @@ struct lc3_encoder {
     lc3_spec_analysis_t spec;
 
     int16_t *xt;
-    float *xs, *xf, s[0];
+    float *xs, *xd, s[0];
 };
 
 #define LC3_ENCODER_BUFFER_COUNT(dt_us, sr_hz) \
     ( ( __LC3_NS(dt_us, sr_hz) + __LC3_NT(sr_hz) ) / 2 + \
-      2*__LC3_NS(dt_us, sr_hz) + __LC3_ND(dt_us, sr_hz) )
+        __LC3_NS(dt_us, sr_hz) + __LC3_ND(dt_us, sr_hz) )
 
 #define LC3_ENCODER_MEM_T(dt_us, sr_hz) \
     struct { \
diff --git a/src/common.h b/src/common.h
index c9160ca..2a4e0cb 100644
--- a/src/common.h
+++ b/src/common.h
@@ -83,6 +83,7 @@
 #define LC3_ND(dt, sr) \
     ( (dt) == LC3_DT_7M5 ? 23 * LC3_NS(dt, sr) / 30 \
                          :  5 * LC3_NS(dt, sr) /  8 )
+
 #define LC3_NE(dt, sr) \
     ( 20 * (3 + (dt)) * (1 + (sr)) )
 
diff --git a/src/lc3.c b/src/lc3.c
index ba45b96..525ac01 100644
--- a/src/lc3.c
+++ b/src/lc3.c
@@ -205,12 +205,12 @@ static void analyze(struct lc3_encoder *encoder,
     enum lc3_srate sr = encoder->sr;
     enum lc3_srate sr_pcm = encoder->sr_pcm;
     int ns = LC3_NS(dt, sr_pcm);
-    int nd = LC3_ND(dt, sr_pcm);
     int nt = LC3_NT(sr_pcm);
 
     int16_t *xt = encoder->xt;
     float *xs = encoder->xs;
-    float *xf = encoder->xf;
+    float *xd = encoder->xd;
+    float *xf = xs;
 
     /* --- Temporal --- */
 
@@ -225,8 +225,7 @@ static void analyze(struct lc3_encoder *encoder,
 
     float e[LC3_NUM_BANDS];
 
-    lc3_mdct_forward(dt, sr_pcm, sr, xs, xf);
-    memmove(xs - nd, xs + (ns-nd), nd * sizeof(*xs));
+    lc3_mdct_forward(dt, sr_pcm, sr, xs, xd, xf);
 
     bool nn_flag = lc3_energy_compute(dt, sr, xf, e);
     if (nn_flag)
@@ -256,7 +255,7 @@ static void encode(struct lc3_encoder *encoder,
     enum lc3_dt dt = encoder->dt;
     enum lc3_srate sr = encoder->sr;
     enum lc3_bandwidth bw = side->bw;
-    float *xf = encoder->xf;
+    float *xf = encoder->xs;
 
     lc3_bits_t bits;
 
@@ -312,7 +311,6 @@ struct lc3_encoder *lc3_setup_encoder(
 
     struct lc3_encoder *encoder = mem;
     int ns = LC3_NS(dt, sr_pcm);
-    int nd = LC3_ND(dt, sr_pcm);
     int nt = LC3_NT(sr_pcm);
 
     *encoder = (struct lc3_encoder){
@@ -320,8 +318,8 @@ struct lc3_encoder *lc3_setup_encoder(
         .sr_pcm = sr_pcm,
 
         .xt = (int16_t *)encoder->s + nt,
-        .xs = encoder->s + (nt+ns)/2 + nd,
-        .xf = encoder->s + (nt+ns)/2 + nd+ns,
+        .xs = encoder->s + (nt+ns)/2,
+        .xd = encoder->s + (nt+ns)/2 + ns,
     };
 
     memset(encoder->s, 0,
diff --git a/src/mdct.c b/src/mdct.c
index 2d651a2..29bc161 100644
--- a/src/mdct.c
+++ b/src/mdct.c
@@ -193,39 +193,35 @@ static struct lc3_complex *fft(const struct lc3_complex *x, int n,
 /**
  * Windowing of samples before MDCT
  * dt, sr          Duration and samplerate (size of the transform)
- * x               [-nd..-1] Previous, [0..ns-1] Current samples
- * y               Output `ns` windowed samples
- *
- * The number of previous samples `nd` accessed on `x` is :
- *   nd: `ns` * 23/30 for 7.5ms frame duration
- *   nd: `ns` *  5/ 8 for  10ms frame duration
+ * x, d            Input current and delayed samples
+ * y, d            Output windowed samples, and delayed ones
  */
-static void mdct_window(
-    enum lc3_dt dt, enum lc3_srate sr, const float *x, float *y)
+static void mdct_window(enum lc3_dt dt, enum lc3_srate sr,
+    const float *x, float *d, float *y)
 {
     int ns = LC3_NS(dt, sr), nd = LC3_ND(dt, sr);
 
     const float *w0 = lc3_mdct_win[dt][sr], *w1 = w0 + ns;
     const float *w2 = w1, *w3 = w2 + nd;
 
-    const float *x0 = x - nd, *x1 = x0 + ns;
-    const float *x2 = x1, *x3 = x2 + nd;
-
+    const float *x0 = x + ns-nd, *x1 = x0;
     float *y0 = y + ns/2, *y1 = y0;
+    float *d0 = d, *d1 = d + nd;
 
-    while (x0 < x1) {
-        *(--y0) = *(x0++) * *(w0++) - *(--x1) * *(--w1);
-        *(--y0) = *(x0++) * *(w0++) - *(--x1) * *(--w1);
+    while (x1 > x) {
+        *(--y0) = *d0 * *(w0++) - *(--x1) * *(--w1);
+        *(y1++) = (*(d0++) = *(x0++)) * *(w2++);
+
+        *(--y0) = *d0 * *(w0++) - *(--x1) * *(--w1);
+        *(y1++) = (*(d0++) = *(x0++)) * *(w2++);
     }
 
-    for (const float *xe = x2 + ns-nd; x2 < xe; ) {
-        *(y1++) = *(x2++) * *(w2++);
-        *(y1++) = *(x2++) * *(w2++);
-    }
+    for (x1 += ns; x0 < x1; ) {
+        *(--y0) = *d0 * *(w0++) - *(--d1) * *(--w1);
+        *(y1++) = (*(d0++) = *(x0++)) * *(w2++) + (*d1 = *(--x1)) * *(--w3);
 
-    while (x2 < x3) {
-        *(y1++) = *(x2++) * *(w2++) + *(--x3) * *(--w3);
-        *(y1++) = *(x2++) * *(w2++) + *(--x3) * *(--w3);
+        *(--y0) = *d0 * *(w0++) - *(--d1) * *(--w1);
+        *(y1++) = (*(d0++) = *(x0++)) * *(w2++) + (*d1 = *(--x1)) * *(--w3);
     }
 }
 
@@ -409,16 +405,17 @@ static void imdct_window(enum lc3_dt dt, enum lc3_srate sr,
  * Forward MDCT transformation
  */
 void lc3_mdct_forward(enum lc3_dt dt, enum lc3_srate sr,
-    enum lc3_srate sr_dst, const float *x, float *y)
+    enum lc3_srate sr_dst, const float *x, float *d, float *y)
 {
     const struct lc3_mdct_rot_def *rot = lc3_mdct_rot[dt][sr];
     int nf = LC3_NS(dt, sr_dst);
     int ns = LC3_NS(dt, sr);
 
-    union { float *f; struct lc3_complex *z; } u = { .f = y };
-    struct lc3_complex z[ns/2];
+    struct lc3_complex buffer[ns/2];
+    struct lc3_complex *z = (struct lc3_complex *)y;
+    union { float *f; struct lc3_complex *z; } u = { .z = buffer };
 
-    mdct_window(dt, sr, x, u.f);
+    mdct_window(dt, sr, x, d, u.f);
 
     mdct_pre_fft(rot, u.f, u.z);
     u.z = fft(u.z, ns/2, u.z, z);
diff --git a/src/mdct.h b/src/mdct.h
index fe8dec7..03ae801 100644
--- a/src/mdct.h
+++ b/src/mdct.h
@@ -33,15 +33,13 @@
  * Forward MDCT transformation
  * dt, sr          Duration and samplerate (size of the transform)
  * sr_dst          Samplerate destination, scale transforam accordingly
- * x               [-nd..-1] Previous, [0..ns-1] Current samples
- * y               Output `ns` frequency coefficients
+ * x, d            Temporal samples and delayed buffer
+ * y, d            Output `ns` coefficients and `nd` delayed samples
  *
- * The number of previous samples `nd` accessed on `x` is :
- *   nd: `ns` * 23/30 for 7.5ms frame duration
- *   nd: `ns` *  5/ 8 for  10ms frame duration
+ * `x` and `y` can be the same buffer
  */
 void lc3_mdct_forward(enum lc3_dt dt, enum lc3_srate sr,
-    enum lc3_srate sr_dst, const float *x, float *y);
+    enum lc3_srate sr_dst, const float *x, float *d, float *y);
 
 /**
  * Inverse MDCT transformation
diff --git a/test/ctypes.h b/test/ctypes.h
index 1c1e56a..97a3add 100644
--- a/test/ctypes.h
+++ b/test/ctypes.h
@@ -729,10 +729,10 @@ static PyObject *from_encoder(PyObject *obj, const struct lc3_encoder *enc)
         new_1d_copy(NPY_INT16, nt+ns, enc->xt-nt));
 
     PyDict_SetItemString(obj, "xs",
-        new_1d_copy(NPY_FLOAT, ns+nd, enc->xs-nd));
+        new_1d_copy(NPY_FLOAT, ns, enc->xs));
 
-    PyDict_SetItemString(obj, "xf",
-        new_1d_copy(NPY_FLOAT, ns, enc->xf));
+    PyDict_SetItemString(obj, "xd",
+        new_1d_copy(NPY_FLOAT, nd, enc->xd));
 
     return obj;
 }
@@ -741,7 +741,7 @@ __attribute__((unused))
 static PyObject *to_encoder(PyObject *obj, struct lc3_encoder *enc)
 {
     unsigned dt, sr, sr_pcm;
-    PyObject *xt_obj, *xs_obj, *xf_obj;
+    PyObject *xt_obj, *xs_obj, *xd_obj;
 
     CTYPES_CHECK("encoder", obj && PyDict_Check(obj));
 
@@ -776,12 +776,12 @@ static PyObject *to_encoder(PyObject *obj, struct lc3_encoder *enc)
     PyDict_SetItemString(obj, "xt", xt_obj);
 
     CTYPES_CHECK("encoder.xs", xs_obj = to_1d_copy(
-        PyDict_GetItemString(obj, "xs"), NPY_FLOAT, enc->xs-nd, ns+nd));
+        PyDict_GetItemString(obj, "xs"), NPY_FLOAT, enc->xs, ns));
     PyDict_SetItemString(obj, "xs", xs_obj);
 
-    CTYPES_CHECK("encoder.xf", xf_obj = to_1d_copy(
-        PyDict_GetItemString(obj, "xf"), NPY_FLOAT, enc->xf, ns));
-    PyDict_SetItemString(obj, "xf", xf_obj);
+    CTYPES_CHECK("encoder.xd", xd_obj = to_1d_copy(
+        PyDict_GetItemString(obj, "xd"), NPY_FLOAT, enc->xd, nd));
+    PyDict_SetItemString(obj, "xd", xd_obj);
 
     return obj;
 }
diff --git a/test/mdct.py b/test/mdct.py
index d13a329..aafba3f 100644
--- a/test/mdct.py
+++ b/test/mdct.py
@@ -99,11 +99,15 @@ def check_forward_unit(rng, dt, sr):
 
     x = (2 * rng.random(ns)) - 1
 
-    mdct = MdctForward(dt, sr)
-    y = [ mdct.run(x), mdct.run(x) ]
+    y   = [ None ] * 2
+    y_c = [ None ] * 2
 
-    y_c = [ lc3.mdct_forward(dt, sr, np.append(np.zeros(nd), x)),
-            lc3.mdct_forward(dt, sr, np.append(x[-nd:], x))      ]
+    mdct = MdctForward(dt, sr)
+    y[0] = mdct.run(x)
+    y[1] = mdct.run(x)
+
+    (y_c[0], d_c) = lc3.mdct_forward(dt, sr, x, np.zeros(nd))
+    y_c[1] = lc3.mdct_forward(dt, sr, x, d_c)[0]
 
     ok = ok and np.amax(np.abs(y[0] - y_c[0])) < 1e-5
     ok = ok and np.amax(np.abs(y[1] - y_c[1])) < 1e-5
@@ -118,12 +122,10 @@ def check_forward_appendix_c(dt):
     nd = T.ND[dt][sr]
     ok = True
 
-    y  = lc3.mdct_forward(dt, sr,
-            np.append(np.zeros(nd), C.X_PCM[dt][0]))
+    (y, d) = lc3.mdct_forward(dt, sr, C.X_PCM[dt][0], np.zeros(nd))
     ok = ok and np.amax(np.abs(y - C.X[dt][0])) < 1e-1
 
-    y  = lc3.mdct_forward(dt, sr,
-            np.append(C.X_PCM[dt][0][-nd:], C.X_PCM[dt][1]))
+    (y, d) = lc3.mdct_forward(dt, sr, C.X_PCM[dt][1], d)
     ok = ok and np.amax(np.abs(y - C.X[dt][1])) < 1e-1
 
     return ok
diff --git a/test/mdct_py.c b/test/mdct_py.c
index 4876976..3479503 100644
--- a/test/mdct_py.c
+++ b/test/mdct_py.c
@@ -25,12 +25,12 @@
 
 static PyObject *mdct_forward_py(PyObject *m, PyObject *args)
 {
-    PyObject *x_obj, *y_obj;
+    PyObject *x_obj, *xd_obj, *y_obj, *d_obj;
     enum lc3_dt dt;
     enum lc3_srate sr;
-    float *x, *y;
+    float *x, *xd, *y, *d;
 
-    if (!PyArg_ParseTuple(args, "iiO", &dt, &sr, &x_obj))
+    if (!PyArg_ParseTuple(args, "iiOO", &dt, &sr, &x_obj, &xd_obj))
         return NULL;
 
     CTYPES_CHECK("dt", (unsigned)dt < LC3_NUM_DT);
@@ -38,12 +38,16 @@ static PyObject *mdct_forward_py(PyObject *m, PyObject *args)
 
     int ns = LC3_NS(dt, sr), nd = LC3_ND(dt, sr);
 
-    CTYPES_CHECK("x", to_1d_ptr(x_obj, NPY_FLOAT, nd+ns, &x));
+    CTYPES_CHECK("x", to_1d_ptr(x_obj, NPY_FLOAT, ns, &x));
+    CTYPES_CHECK("xd", to_1d_ptr(xd_obj, NPY_FLOAT, nd, &xd));
+    d_obj = new_1d_ptr(NPY_FLOAT, nd, &d);
     y_obj = new_1d_ptr(NPY_FLOAT, ns, &y);
 
-    lc3_mdct_forward(dt, sr, sr, x+nd, y);
+    memcpy(d, xd, nd * sizeof(float));
 
-    return Py_BuildValue("N", y_obj);
+    lc3_mdct_forward(dt, sr, sr, x, d, y);
+
+    return Py_BuildValue("NN", y_obj, d_obj);
 }
 
 static PyObject *mdct_inverse_py(PyObject *m, PyObject *args)