_DIGITAL SPEECH COMPRESSION_ by Jutta Degener Listing One /* params.h -- common definitions for the speech processing listings. */ #define P_MAX 8 /* order p of LPC analysis, typically 8..14 */ #define WINDOW 160 /* window size for short-term processing */ double levinson_durbin(double const * ac, double * ref, double * lpc); double schur(double const * ac, double * ref); void autocorrelation(int n, double const * x, int lag, double * ac); Listing Two /* LPC- and Reflection Coefficients * The next two functions calculate linear prediction coefficients * and/or the related reflection coefficients from the first P_MAX+1 * values of the autocorrelation function. */ #include "params.h" /* for P_MAX */ /* The Levinson-Durbin algorithm was invented by N. Levinson in 1947 * and modified by J. Durbin in 1959. */ double /* returns minimum mean square error */ levinson_durbin( double const * ac, /* in: [0...p] autocorrelation values */ double * ref, /* out: [0...p-1] reflection coefficients */ double * lpc) /* [0...p-1] LPC coefficients */ { int i, j; double r, error = ac[0]; if (ac[0] == 0) { for (i = 0; i < P_MAX; i++) ref[i] = 0; return 0; } for (i = 0; i < P_MAX; i++) { /* Sum up this iteration's reflection coefficient. */ r = -ac[i + 1]; for (j = 0; j < i; j++) r -= lpc[j] * ac[i - j]; ref[i] = r /= error; /* Update LPC coefficients and total error. */ lpc[i] = r; for (j = 0; j < i / 2; j++) { double tmp = lpc[j]; lpc[j] = r * lpc[i - 1 - j]; lpc[i - 1 - j] += r * tmp; } if (i % 2) lpc[j] += lpc[j] * r; error *= 1 - r * r; } return error; } /* I. Schur's recursion from 1917 is related to the Levinson-Durbin method, * but faster on parallel architectures; where Levinson-Durbin would take time * proportional to p * log(p), Schur only requires time proportional to p. The * GSM coder uses an integer version of the Schur recursion. */ double /* returns the minimum mean square error */ schur( double const * ac, /* in: [0...p] autocorrelation values */ double * ref) /* out: [0...p-1] reflection coefficients */ { int i, m; double r, error = ac[0], G[2][P_MAX]; if (ac[0] == 0.0) { for (i = 0; i < P_MAX; i++) ref[i] = 0; return 0; } /* Initialize the rows of the generator matrix G to ac[1...p]. */ for (i = 0; i < P_MAX; i++) G[0][i] = G[1][i] = ac[i + 1]; for (i = 0;;) { /* Calculate this iteration's reflection coefficient and error. */ ref[i] = r = -G[1][0] / error; error += G[1][0] * r; if (++i >= P_MAX) return error; /* Update the generator matrix. Unlike Levinson-Durbin's summing of * reflection coefficients, this loop could be executed in parallel * by p processors in constant time. */ for (m = 0; m < P_MAX - i; m++) { G[1][m] = G[1][m + 1] + r * G[0][m]; G[0][m] = G[1][m + 1] * r + G[0][m]; } } } /* Compute the autocorrelation * ,--, * ac(l) = > x(i) * x(i-l) for all i * `--' * for lags l between 0 and lag-1, and x(i) == 0 for i < 0 or i >= n */ void autocorrelation( int n, double const * x, /* in: [0...n-1] samples x */ int lag, double * ac) /* out: [0...lag-1] autocorrelation */ { double d; int i; while (lag--) { for (i = lag, d = 0; i < n; i++) d += x[i] * x[i-lag]; ac[lag] = d; } } Listing Three /* Short-Term Linear Prediction * To show which parts of speech are picked up by short-term linear * prediction, this program replaces everything but the short-term * predictable parts of its input with a fixed periodic pulse. (You * may want to try other excitations.) The result lacks pitch * information, but is still discernible. */ #include #include #include #include "params.h" /* See Listing One; #defines WINDOW and P_MAX */ /* Default period for a pulse that will feed the short-term processing. The * length of the period is the inverse of the pitch of the program's * "robot voice"; the smaller the period, the higher the voice. */ #define PERIOD 100 /* human speech: between 16 and 160 */ /* The short-term synthesis and analysis functions below filter and inverse- * filter their input according to reflection coefficients from Listing Two. */ static void short_term_analysis( double const * ref, /* in: [0...p-1] reflection coefficients */ int n, /* # of samples */ double const * in, /* [0...n-1] input samples */ double * out) /* out: [0...n-1] short-term residual */ { double sav, s, ui; int i; static double u[P_MAX]; while (n--) { sav = s = *in++; for (i = 0; i < P_MAX; i++) { ui = u[i]; u[i] = sav; sav = ui + ref[i] * s; s = s + ref[i] * ui; } *out++ = s; } } static void short_term_synthesis( double const * ref, /* in: [0...p-1] reflection coefficients */ int n, /* # of samples */ double const * in, /* [0...n-1] residual input */ double * out) /* out: [0...n-1] short-term signal */ { double s; int i; static double u[P_MAX+1]; while (n--) { s = *in++; for (i = P_MAX; i--;) { s -= ref[i] * u[i]; u[i+1] = ref[i] * s + u[i]; } *out++ = u[0] = s; } } /* This fake long-term processing section implements the "robotic" voice: * it replaces the short-term residual by a fixed periodic pulse. */ static void long_term(double * d) { int i; static int r; for (i = 0; i < WINDOW; i++) d[i] = 0; for (; r < WINDOW; r += PERIOD) d[r] = 10000.0; r -= WINDOW; } /* Read signed short PCM values from stdin, process them as double, * and write the result back as shorts. */ int main(int argc, char ** argv) { short s[WINDOW]; double d[WINDOW]; int i, n; double ac[P_MAX + 1], ref[P_MAX]; while ((n = fread(s, sizeof(*s), WINDOW, stdin)) > 0) { for (i = 0; i < n; i++) d[i] = s[i]; for (; i < WINDOW; i++) d[i] = 0; /* Split input into short-term predictable part and residual. */ autocorrelation(WINDOW, d, P_MAX + 1, ac); schur(ac, ref); short_term_analysis(ref, WINDOW, d, d); /* Process that residual, and synthesize the speech again. */ long_term(d); short_term_synthesis(ref, WINDOW, d, d); /* Convert back to short, and write. */ for (i = 0; i < n; i++) s[i] = d[i] > SHRT_MAX ? SHRT_MAX : d[i] < SHRT_MIN ? SHRT_MIN : d[i]; if (fwrite(s, sizeof(*s), n, stdout) != n) { fprintf(stderr, "%s: write failed\n", *argv); exit(1); } if (feof(stdin)) break; } if (ferror(stdin)) { fprintf(stderr,"%s: read failed\n", *argv); exit(1); } return 0; } Listing Four /* Long-Term Prediction * Here's a replacement for the long_term() function of Listing Three: * A "voice" that is based on the two long-term prediction parameters, * the gain and the lag. By transmitting very little information, * the final output can be made to sound much more natural. */ #include #include #include #include "params.h" /* see Listing One; #defines WINDOW */ #define SUBWIN 40 /* LTP window size, WINDOW % SUBWIN == 0 */ /* Compute n * ,--, * cc(l) = > x(i) * y(i-l) * `--' * i=0 * for lags l from 0 to lag-1. */ static void crosscorrelation( int n, /* in: # of sample values */ double const * x, /* [0...n-1] samples x */ double const * y, /* [-lag+1...n-1] samples y */ int lag, /* maximum lag+1 */ double * c) /* out: [0...lag-1] cc values */ { while (lag--) { int i; double d = 0; for (i = 0; i < n; i++) d += x[i] * y[i - lag]; c[lag] = d; } } /* Calculate long-term prediction lag and gain. */ static void long_term_parameters( double const * d, /* in: [0.....SUBWIN-1] samples */ double const * prev, /* [-3*SUBWIN+1...0] past signal */ int * lag_out, /* out: LTP lag */ double * gain_out) /* LTP gain */ { int i, lag; double cc[2 * SUBWIN], maxcc, energy; /* Find the maximum correlation with lags SUBWIN...3*SUBWIN-1 * between this frame and the previous ones. */ crosscorrelation(SUBWIN, d, prev - SUBWIN, 2 * SUBWIN, cc); maxcc = cc[lag = 0]; for (i = 1; i < 2 * SUBWIN; i++) if (cc[i] > maxcc) maxcc = cc[lag = i]; *lag_out = lag + SUBWIN; /* Calculate the gain from the maximum correlation and * the energy of the selected SUBWIN past samples. */ autocorrelation(SUBWIN, prev - *lag_out, 1, &energy); *gain_out = energy ? maxcc / energy : 1.0; } /* The "reduce" function simulates the effect of quantizing, * encoding, transmitting, decoding, and inverse quantizing the * residual signal by losing most of its information. For this * experiment, we simply throw away everything but the first sample value. */ static void reduce(double *d) { int i; for (i = 1; i < SUBWIN; i++) d[i] = 0; } void long_term(double * d) { static double prev[3*SUBWIN]; double gain; int n, i, lag; for (n = 0; n < WINDOW/SUBWIN; n++, d += SUBWIN) { long_term_parameters(d, prev + 3*SUBWIN, &lag, &gain); /* Remove the long-term predictable parts. */ for (i = 0; i < SUBWIN; i++) d[i] -= gain * prev[3 * SUBWIN - lag + i]; /* Simulate encoding and transmission of the residual. */ reduce(d); /* Estimate the signal from predicted parameters and residual. */ for (i = 0; i < SUBWIN; i++) d[i] += gain * prev[3*SUBWIN - lag + i]; /* Shift the SUBWIN new estimates into the past. */ for (i = 0; i < 2*SUBWIN; i++) prev[i] = prev[i + SUBWIN]; for (i = 0; i < SUBWIN; i++) prev[2*SUBWIN + i] = d[i]; } }