Force 32bit alignment for vectorized operations, fixes 32bit build

Signed-off-by: falkTX <falktx@falktx.com>
2022-03-14 23:19:16 +00:00 · 2022-03-14 23:19:16 +00:00 · 2dc12fb1ca
commit 2dc12fb1ca
parent 6d9633b7fb
8 changed files with 656 additions and 5 deletions
--- a/include/dsp/fir.hpp
+++ b/include/dsp/fir.hpp
@ -0,0 +1,164 @@
+/*
+ * DISTRHO Cardinal Plugin
+ * Copyright (C) 2021-2022 Filipe Coelho <falktx@falktx.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 3 of
+ * the License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * For a full copy of the GNU General Public License see the LICENSE file.
+ */
+
+/**
+ * This file is an edited version of VCVRack's dsp/fir.hpp
+ * Copyright (C) 2016-2021 VCV.
+ *
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 3 of
+ * the License, or (at your option) any later version.
+ */
+
+#pragma once
+
+#include <pffft.h>
+
+#include <dsp/common.hpp>
+
+
+namespace rack {
+namespace dsp {
+
+
+/** Performs a direct sum convolution */
+inline float convolveNaive(const float* in, const float* kernel, int len) {
+	float y = 0.f;
+	for (int i = 0; i < len; i++) {
+		y += in[len - 1 - i] * kernel[i];
+	}
+	return y;
+}
+
+/** Computes the impulse response of a boxcar lowpass filter */
+inline void boxcarLowpassIR(float* out, int len, float cutoff = 0.5f) {
+	for (int i = 0; i < len; i++) {
+		float t = i - (len - 1) / 2.f;
+		out[i] = 2 * cutoff * sinc(2 * cutoff * t);
+	}
+}
+
+
+struct RealTimeConvolver {
+	// `kernelBlocks` number of contiguous FFT blocks of size `blockSize`
+	// indexed by [i * blockSize*2 + j]
+	float* kernelFfts = NULL;
+	float* inputFfts = NULL;
+	float* outputTail = NULL;
+	float* tmpBlock = NULL;
+	size_t blockSize;
+	size_t kernelBlocks = 0;
+	size_t inputPos = 0;
+	PFFFT_Setup* pffft;
+
+	/** `blockSize` is the size of each FFT block. It should be >=32 and a power of 2. */
+	RealTimeConvolver(size_t blockSize) {
+		this->blockSize = blockSize;
+		pffft = pffft_new_setup(blockSize * 2, PFFFT_REAL);
+		outputTail = (float*) pffft_aligned_malloc(sizeof(float) * blockSize);
+		std::memset(outputTail, 0, blockSize * sizeof(float));
+		tmpBlock = (float*) pffft_aligned_malloc(sizeof(float) * blockSize * 2);
+		std::memset(tmpBlock, 0, blockSize * 2 * sizeof(float));
+	}
+
+	~RealTimeConvolver() {
+		setKernel(NULL, 0);
+		pffft_aligned_free(outputTail);
+		pffft_aligned_free(tmpBlock);
+		pffft_destroy_setup(pffft);
+	}
+
+	void setKernel(const float* kernel, size_t length) {
+		// Clear existing kernel
+		if (kernelFfts) {
+			pffft_aligned_free(kernelFfts);
+			kernelFfts = NULL;
+		}
+		if (inputFfts) {
+			pffft_aligned_free(inputFfts);
+			inputFfts = NULL;
+		}
+		kernelBlocks = 0;
+		inputPos = 0;
+
+		if (kernel && length > 0) {
+			// Round up to the nearest factor of `blockSize`
+			kernelBlocks = (length - 1) / blockSize + 1;
+
+			// Allocate blocks
+			kernelFfts = (float*) pffft_aligned_malloc(sizeof(float) * blockSize * 2 * kernelBlocks);
+			inputFfts = (float*) pffft_aligned_malloc(sizeof(float) * blockSize * 2 * kernelBlocks);
+			std::memset(inputFfts, 0, sizeof(float) * blockSize * 2 * kernelBlocks);
+
+			for (size_t i = 0; i < kernelBlocks; i++) {
+				// Pad each block with zeros
+				std::memset(tmpBlock, 0, sizeof(float) * blockSize * 2);
+				size_t len = std::min((int) blockSize, (int)(length - i * blockSize));
+				std::memcpy(tmpBlock, &kernel[i * blockSize], sizeof(float)*len);
+				// Compute fft
+				pffft_transform(pffft, tmpBlock, &kernelFfts[blockSize * 2 * i], NULL, PFFFT_FORWARD);
+			}
+		}
+	}
+
+	/** Applies reverb to input
+	input and output must be of size `blockSize`
+	*/
+	void processBlock(const float* input, float* output) {
+		if (kernelBlocks == 0) {
+			std::memset(output, 0, sizeof(float) * blockSize);
+			return;
+		}
+
+		// Step input position
+		inputPos = (inputPos + 1) % kernelBlocks;
+		// Pad block with zeros
+		std::memset(tmpBlock, 0, sizeof(float) * blockSize * 2);
+		std::memcpy(tmpBlock, input, sizeof(float) * blockSize);
+		// Compute input fft
+		pffft_transform(pffft, tmpBlock, &inputFfts[blockSize * 2 * inputPos], NULL, PFFFT_FORWARD);
+		// Create output fft
+		std::memset(tmpBlock, 0, sizeof(float) * blockSize * 2);
+		// convolve input fft by kernel fft
+		// Note: This is the CPU bottleneck loop
+		for (size_t i = 0; i < kernelBlocks; i++) {
+			size_t pos = (inputPos - i + kernelBlocks) % kernelBlocks;
+			pffft_zconvolve_accumulate(pffft, &kernelFfts[blockSize * 2 * i], &inputFfts[blockSize * 2 * pos], tmpBlock, 1.f);
+		}
+		// Compute output
+		pffft_transform(pffft, tmpBlock, tmpBlock, NULL, PFFFT_BACKWARD);
+		// Add block tail from last output block
+		for (size_t i = 0; i < blockSize; i++) {
+			tmpBlock[i] += outputTail[i];
+		}
+		// Copy output block to output
+		float scale = 1.f / (blockSize * 2);
+		for (size_t i = 0; i < blockSize; i++) {
+			// Scale based on FFT
+			output[i] = tmpBlock[i] * scale;
+		}
+		// Set tail
+		for (size_t i = 0; i < blockSize; i++) {
+			outputTail[i] = tmpBlock[i + blockSize];
+		}
+	}
+};
+
+
+} // namespace dsp
+} // namespace rack