/********************************************************************** Audacity: A Digital Audio Editor EffectEqualization.cpp Andrew Hallendorff *******************************************************************//** \file Equalization48x.cpp \brief Fast SSE based implementation of equalization. *//****************************************************************/ #include "../Audacity.h" #include "../Project.h" #ifdef EXPERIMENTAL_EQ_SSE_THREADED #include "Equalization.h" #include "../WaveTrack.h" #include "float_cast.h" #include #include #include #include #if wxUSE_TOOLTIPS #include #endif #include #include #include #include "Equalization48x.h" #include "../RealFFTf.h" #include "../RealFFTf48x.h" #ifndef USE_SSE2 #define USE_SSE2 #endif #include #include #include #include #include #ifdef _WIN32 // Windows #include #define cpuid __cpuid #else // GCC Inline Assembly /* Non-Windows fallback for cpuid: runs the CPUID instruction through inline assembly with InfoType loaded in EAX and stores the resulting EAX, EBX, ECX and EDX values in CPUInfo[0..3]. */ void cpuid(int CPUInfo[4],int InfoType){ __asm__ __volatile__ ( "cpuid": "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : "a" (InfoType) ); } #endif /* File-level state: a flag telling whether sMathCaps has been filled in, the cached CPU capability flags, and the bit mask naming the math path currently selected. */ bool sMathCapsInitialized = false; MathCaps sMathCaps; // dirty switcher int sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; /* Accessors for the global sMathPath bit mask: replace it, read it, OR an option into it, and clear an option from it. */ void EffectEqualization48x::SetMathPath(int mathPath) { sMathPath=mathPath; }; int EffectEqualization48x::GetMathPath() { return sMathPath; }; void EffectEqualization48x::AddMathPathOption(int mathPath) { sMathPath|=mathPath; }; void EffectEqualization48x::RemoveMathPathOption(int mathPath) { sMathPath&=~mathPath; }; /* Returns a pointer to the global sMathCaps. On the first call only, every flag is cleared, cpuid leaves 0 and 0x80000000 are read to obtain nIds and nExIds, and the flags are then filled from the bits of leaf 1 (when nIds >= 1) and leaf 0x80000001 (when nExIds >= 0x80000001). When SSE is reported, sMathPath is reset to SSE plus THREADED. Later calls return the cached structure unchanged. */ MathCaps *EffectEqualization48x::GetMathCaps() { if(!sMathCapsInitialized) { sMathCapsInitialized=true; sMathCaps.x64 = false; sMathCaps.MMX = false; sMathCaps.SSE = false; sMathCaps.SSE2 = false; sMathCaps.SSE3 = false; sMathCaps.SSSE3 = false; sMathCaps.SSE41 = false; sMathCaps.SSE42 = false; sMathCaps.SSE4a = false; sMathCaps.AVX = false; sMathCaps.XOP = false; sMathCaps.FMA3 = false; sMathCaps.FMA4 = false; int info[4]; cpuid(info, 0); int nIds = info[0]; cpuid(info, 0x80000000); int 
nExIds = info[0]; // Detect Instruction Set if (nIds >= 1){ cpuid(info,0x00000001); sMathCaps.MMX = (info[3] & ((int)1 << 23)) != 0; sMathCaps.SSE = (info[3] & ((int)1 << 25)) != 0; sMathCaps.SSE2 = (info[3] & ((int)1 << 26)) != 0; sMathCaps.SSE3 = (info[2] & ((int)1 << 0)) != 0; sMathCaps.SSSE3 = (info[2] & ((int)1 << 9)) != 0; sMathCaps.SSE41 = (info[2] & ((int)1 << 19)) != 0; sMathCaps.SSE42 = (info[2] & ((int)1 << 20)) != 0; sMathCaps.AVX = (info[2] & ((int)1 << 28)) != 0; sMathCaps.FMA3 = (info[2] & ((int)1 << 12)) != 0; } if (nExIds >= 0x80000001){ cpuid(info,0x80000001); sMathCaps.x64 = (info[3] & ((int)1 << 29)) != 0; sMathCaps.SSE4a = (info[2] & ((int)1 << 6)) != 0; sMathCaps.FMA4 = (info[2] & ((int)1 << 16)) != 0; sMathCaps.XOP = (info[2] & ((int)1 << 11)) != 0; } if(sMathCaps.SSE) sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; // we are starting on. } return &sMathCaps; }; /* Allocates size bytes for SIMD use: _aligned_malloc(size, 16) when WIN32 is defined, memalign(16, size) on __linux__, plain malloc on __MACH__, and valloc otherwise. NOTE(review): the __MACH__ branch depends on the default malloc alignment being enough for the SSE loads used here - confirm. NOTE(review): this tests WIN32 while the cpuid section tests _WIN32 - confirm both are defined by the build. */ void * malloc_simd(const size_t size) { #if defined WIN32 // WIN32 return _aligned_malloc(size, 16); #elif defined __linux__ // Linux return memalign (16, size); #elif defined __MACH__ // Mac OS X return malloc(size); #else // other (use valloc for page-aligned memory) return valloc(size); #endif } /* Releases memory obtained from malloc_simd: _aligned_free when WIN32 is defined, free otherwise. */ void free_simd(void* mem) { #if defined WIN32 // WIN32 _aligned_free(mem); #else free(mem); #endif } /* Constructor: sets every size, counter and pointer member to zero, NULL or false, except mBlocksPerBuffer which starts at 20. Nothing is allocated here. */ EffectEqualization48x::EffectEqualization48x(): mThreadCount(0),mFilterSize(0),mWindowSize(0),mBlockSize(0),mWorkerDataCount(0),mBlocksPerBuffer(20), mScratchBufferSize(0),mSubBufferSize(0),mBigBuffer(NULL),mBufferInfo(NULL),mEQWorkers(0),mThreaded(false), mBenching(false),mBufferCount(0) { } /* Destructor: empty body. */ EffectEqualization48x::~EffectEqualization48x() { } /* Frees any existing buffers, then derives mFilterSize (mM-1 rounded down to a multiple of 16), mWindowSize and mBlockSize from the owning effect, picks the number of worker data slots (CPU count plus 2 when nThreads > 0, otherwise 1), and allocates mBigBuffer through malloc_simd together with a BufferInfo array. NOTE(review): in this copy of the file the text after the start of the BufferInfo fill loop is missing up to the tail of what looks like RunFunctionSelect; text between angle brackets appears to have been stripped - restore from the original before relying on this section. */ bool EffectEqualization48x::AllocateBuffersWorkers(int nThreads) { if(mBigBuffer) FreeBuffersWorkers(); mFilterSize=(mEffectEqualization->mM-1)&(~15); // 4000 !!! Filter MUST BE QUAD WORD ALIGNED !!!! 
mWindowSize=mEffectEqualization->windowSize; mBlockSize=mWindowSize-mFilterSize; // 12,384 mThreaded = (nThreads > 0 ); if(mThreaded) { mThreadCount=wxThread::GetCPUCount(); mWorkerDataCount=mThreadCount+2; // 2 extra slots (maybe double later) } else { mWorkerDataCount=1; mThreadCount=0; } #ifdef __AVX_ENABLED mBufferCount=sMathPath&MATH_FUNCTION_AVX?8:4; #else mBufferCount=4; #endif // we're skewing the data by one block to allow for 1/4 block intersections. // this will remove the disparity in data at the intersections of the runs // The nice magic allocation // megabyte - 3 windows - 4 overlapping buffers - filter // 2^20 = 1,048,576 - 3 * 2^14 (16,384) - ((4 * 20) - 3) * 12,384 - 4000 // 1,048,576 - 49,152 - 953,568 - 4000 = 41,856 (leftover) mScratchBufferSize=mWindowSize*3*sizeof(float)*mBufferCount; // 3 window size blocks of instruction size mSubBufferSize=mBlockSize*(mBufferCount*(mBlocksPerBuffer-1)); // we are going to do a full block overlap mBigBuffer=(float *)malloc_simd(sizeof(float)*(mSubBufferSize+mFilterSize+mScratchBufferSize)*mWorkerDataCount); // we run over by filtersize // fill the bufferInfo mBufferInfo = new BufferInfo[mWorkerDataCount]; for(int i=0;iProcessOne(count, track, start, len); break; } return false; } #pragma warning(pop) /* Main entry point. Stores the owning effect, copies its input tracks, and when any math path bit is set forces mM to a multiple of 16 plus 1 so that mM-1 is 16-aligned. It then allocates the buffers (threaded when the THREADED bit is set), walks the tracks produced by SelectedTrackListOfKindIterator for Track::Wave, clips the range mT0..mT1 to each track and hands the sample range to RunFunctionSelect. A true result from RunFunctionSelect stops the walk. Buffers are freed, ReplaceProcessedTracks receives !bBreakLoop, and the function returns !bBreakLoop. */ bool EffectEqualization48x::Process(EffectEqualization* effectEqualization) { mEffectEqualization=effectEqualization; // return TrackCompare(); // used for debugging data mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks. bool bBreakLoop = false; TableUsage(sMathPath); if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!! 
mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1; AllocateBuffersWorkers(sMathPath&MATH_FUNCTION_THREADED); SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks); WaveTrack *track = (WaveTrack *) iter.First(); int count = 0; while (track) { double trackStart = track->GetStartTime(); double trackEnd = track->GetEndTime(); double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1; if (t1 > t0) { sampleCount start = track->TimeToLongSamples(t0); sampleCount end = track->TimeToLongSamples(t1); sampleCount len = (sampleCount)(end - start); bBreakLoop=RunFunctionSelect(sMathPath, count, track, start, len); if( bBreakLoop ) break; } track = (WaveTrack *) iter.Next(); count++; } FreeBuffersWorkers(); mEffectEqualization->ReplaceProcessedTracks(!bBreakLoop); return !bBreakLoop; } /* Debugging aid, referenced only by the commented-out call in Process. It duplicates the selected tracks into a second TrackList, runs RunFunctionSelect over the duplicate list and then over mOutputTracks, and finally calls DeltaTrack on each pair of tracks. NOTE(review): both Add calls in the duplication loop target SecondIMap, so SecondOMap stays empty - confirm intent. NOTE(review): unlike Process, this returns bBreakLoop rather than its negation. */ bool EffectEqualization48x::TrackCompare() { mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks. bool bBreakLoop = false; TableUsage(sMathPath); if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!! mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1; AllocateBuffersWorkers(sMathPath&MATH_FUNCTION_THREADED); // Reset map wxArrayPtrVoid SecondIMap; wxArrayPtrVoid SecondOMap; SecondIMap.Clear(); SecondOMap.Clear(); TrackList *SecondOutputTracks = new TrackList(); //iterate over tracks of type trackType (All types if Track::All) TrackListOfKindIterator aIt(mEffectEqualization->mOutputTracksType, mEffectEqualization->mTracks); for (Track *aTrack = aIt.First(); aTrack; aTrack = aIt.Next()) { // Include selected tracks, plus sync-lock selected tracks for Track::All. 
if (aTrack->GetSelected() || (mEffectEqualization->mOutputTracksType == Track::All && aTrack->IsSyncLockSelected())) { Track *o = aTrack->Duplicate(); SecondOutputTracks->Add(o); SecondIMap.Add(aTrack); SecondIMap.Add(o); } } for(int i=0;i<2;i++) { SelectedTrackListOfKindIterator iter(Track::Wave, i?mEffectEqualization->mOutputTracks:SecondOutputTracks); i?sMathPath=sMathPath:sMathPath=0; /* NOTE(review): pass 0 stores 0 in the global sMathPath and pass 1 assigns it to itself, so both passes run with math path 0 and the previous value is not restored - confirm. */ WaveTrack *track = (WaveTrack *) iter.First(); int count = 0; while (track) { double trackStart = track->GetStartTime(); double trackEnd = track->GetEndTime(); double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1; if (t1 > t0) { sampleCount start = track->TimeToLongSamples(t0); sampleCount end = track->TimeToLongSamples(t1); sampleCount len = (sampleCount)(end - start); bBreakLoop=RunFunctionSelect(sMathPath, count, track, start, len); if( bBreakLoop ) break; } track = (WaveTrack *) iter.Next(); count++; } } SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks); SelectedTrackListOfKindIterator iter2(Track::Wave, SecondOutputTracks); WaveTrack *track = (WaveTrack *) iter.First(); WaveTrack *track2 = (WaveTrack *) iter2.First(); while (track) { double trackStart = track->GetStartTime(); double trackEnd = track->GetEndTime(); double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; double t1 = mEffectEqualization->mT1 > trackEnd? 
trackEnd: mEffectEqualization->mT1; if (t1 > t0) { sampleCount start = track->TimeToLongSamples(t0); sampleCount end = track->TimeToLongSamples(t1); sampleCount len = (sampleCount)(end - start); DeltaTrack(track, track2, start, len); } track = (WaveTrack *) iter.Next(); track2 = (WaveTrack *) iter2.Next(); } delete SecondOutputTracks; FreeBuffersWorkers(); mEffectEqualization->ReplaceProcessedTracks(!bBreakLoop); return bBreakLoop; } /* Reads tracks t and t2 in chunks of at most GetMaxBlockSize samples into two float buffers, appends buffer1 to a newly created float wave track, and pastes that track back over t through ProcessTail. NOTE(review): the per-sample loop that combines the two buffers is truncated in this copy, so the exact operation cannot be read from here. Always returns true. */ bool EffectEqualization48x::DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len) { sampleCount trackBlockSize = t->GetMaxBlockSize(); float *buffer1 = new float[trackBlockSize]; float *buffer2 = new float[trackBlockSize]; AudacityProject *p = GetActiveProject(); WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); sampleCount originalLen = len; sampleCount currentSample = start; while(len) { sampleCount curretLength=(trackBlockSize>len)?len:trackBlockSize; t->Get((samplePtr)buffer1, floatSample, currentSample, curretLength); t2->Get((samplePtr)buffer2, floatSample, currentSample, curretLength); for(int i=0;iAppend((samplePtr)buffer1, floatSample, curretLength); currentSample+=curretLength; len-=curretLength; } delete[] buffer1; delete[] buffer2; output->Flush(); len=originalLen; ProcessTail(t, output, start, len); delete output; return true; } /* Times RunFunctionSelect over the tracks for five configurations, in this order: SSE threaded, SSE, segmented, threaded segmented, and math path 0. The two SSE runs are skipped when sMathCaps.SSE is false. The elapsed times are then shown in a message box. bBreakLoop is reset to false before ReplaceProcessedTracks and before the return, so this function always returns false. */ bool EffectEqualization48x::Benchmark(EffectEqualization* effectEqualization) { mEffectEqualization=effectEqualization; mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks. bool bBreakLoop = false; TableUsage(sMathPath); if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!! 
mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1; AllocateBuffersWorkers(MATH_FUNCTION_THREADED); SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks); long times[] = { 0,0,0,0,0 }; wxStopWatch timer; mBenching=true; for(int i=0;i<5 && !bBreakLoop;i++) { int localMathPath; switch(i) { case 0: localMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; if(!sMathCaps.SSE) localMathPath=-1; break; case 1: localMathPath=MATH_FUNCTION_SSE; if(!sMathCaps.SSE) localMathPath=-1; break; case 2: localMathPath=MATH_FUNCTION_SEGMENTED_CODE; break; case 3: localMathPath=MATH_FUNCTION_THREADED|MATH_FUNCTION_SEGMENTED_CODE; break; case 4: localMathPath=0; break; default: localMathPath=-1; } if(localMathPath>=0) { timer.Start(); WaveTrack *track = (WaveTrack *) iter.First(); int count = 0; while (track) { double trackStart = track->GetStartTime(); double trackEnd = track->GetEndTime(); double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; double t1 = mEffectEqualization->mT1 > trackEnd? 
trackEnd: mEffectEqualization->mT1; if (t1 > t0) { sampleCount start = track->TimeToLongSamples(t0); sampleCount end = track->TimeToLongSamples(t1); sampleCount len = (sampleCount)(end - start); bBreakLoop=RunFunctionSelect( localMathPath, count, track, start, len); if( bBreakLoop ) break; } track = (WaveTrack *) iter.Next(); count++; } times[i]=timer.Time(); } } FreeBuffersWorkers(); mBenching=false; bBreakLoop=false; mEffectEqualization->ReplaceProcessedTracks(bBreakLoop); wxTimeSpan tsSSEThreaded(0, 0, 0, times[0]); wxTimeSpan tsSSE(0, 0, 0, times[1]); wxTimeSpan tsDefaultEnhanced(0, 0, 0, times[2]); wxTimeSpan tsDefaultThreaded(0, 0, 0, times[3]); wxTimeSpan tsDefault(0, 0, 0, times[4]); wxMessageBox(wxString::Format(_("Benchmark times:\nOriginal: %s\nDefault Segmented: %s\nDefault Threaded: %s\nSSE: %s\nSSE Threaded: %s\n"),tsDefault.Format(wxT("%M:%S.%l")).c_str(), tsDefaultEnhanced.Format(wxT("%M:%S.%l")).c_str(), tsDefaultThreaded.Format(wxT("%M:%S.%l")).c_str(),tsSSE.Format(wxT("%M:%S.%l")).c_str(),tsSSEThreaded.Format(wxT("%M:%S.%l")).c_str())); return bBreakLoop; } /* Copies the processed audio held in output back into track t for the sample range start..start+len, one clip at a time, so that the clip layout of t is kept. For each clip overlapping the range the overlapping part is cleared, the matching part of output is pasted in, and Join is called when the clip was only partly covered and the range is not wholly inside that clip. Always returns true. NOTE(review): the element types of the two vectors and the head of the replace loop are truncated in this copy. */ bool EffectEqualization48x::ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len) { // double offsetT0 = t->LongSamplesToTime((sampleCount)offset); double lenT = t->LongSamplesToTime(len); // 'start' is the sample offset in 't', the passed in track // 'startT' is the equivalent time value // 'output' starts at zero double startT = t->LongSamplesToTime(start); //output has one waveclip for the total length, even though //t might have whitespace separating multiple clips //we want to maintain the original clip structure, so //only paste the intersections of the new clip. 
//Find the bits of clips that need replacing std::vector > clipStartEndTimes; std::vector > clipRealStartEndTimes; //the above may be truncated due to a clip being partially selected for (WaveClipList::compatibility_iterator it=t->GetClipIterator(); it; it=it->GetNext()) { WaveClip *clip; double clipStartT; double clipEndT; clip = it->GetData(); clipStartT = clip->GetStartTime(); clipEndT = clip->GetEndTime(); if( clipEndT <= startT ) continue; // clip is not within selection if( clipStartT >= startT + lenT ) continue; // clip is not within selection //save the actual clip start/end so that we can rejoin them after we paste. clipRealStartEndTimes.push_back(std::pair(clipStartT,clipEndT)); if( clipStartT < startT ) // does selection cover the whole clip? clipStartT = startT; // don't copy all the new clip if( clipEndT > startT + lenT ) // does selection cover the whole clip? clipEndT = startT + lenT; // don't copy all the new clip //save them clipStartEndTimes.push_back(std::pair(clipStartT,clipEndT)); } //now go thru and replace the old clips with new for(unsigned int i=0;iClear(clipStartEndTimes[i].first,clipStartEndTimes[i].second); // output->Copy(clipStartEndTimes[i].first-startT+offsetT0,clipStartEndTimes[i].second-startT+offsetT0, &toClipOutput); output->Copy(clipStartEndTimes[i].first-startT,clipStartEndTimes[i].second-startT, &toClipOutput); if(toClipOutput) { //put the processed audio in bool bResult = t->Paste(clipStartEndTimes[i].first, toClipOutput); wxASSERT(bResult); // TO DO: Actually handle this. //if the clip was only partially selected, the Paste will have created a split line. 
Join is needed to take care of this //This is not true when the selection is fully contained within one clip (second half of conditional) if( (clipRealStartEndTimes[i].first != clipStartEndTimes[i].first || clipRealStartEndTimes[i].second != clipStartEndTimes[i].second) && !(clipRealStartEndTimes[i].first <= startT && clipRealStartEndTimes[i].second >= startT+lenT) ) t->Join(clipRealStartEndTimes[i].first,clipRealStartEndTimes[i].second); delete toClipOutput; } } return true; } /* Convenience wrapper: fills a temporary BufferInfo describing one contiguous source and destination pair of bufferLength samples, places the scratch area at sourceBuffer[mSubBufferSize], and returns the result of ProcessBuffer1x on it. */ bool EffectEqualization48x::ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength) { BufferInfo bufferInfo; bufferInfo.mContiguousBufferSize=bufferLength; bufferInfo.mBufferSouce[0]=sourceBuffer; bufferInfo.mBufferDest[0]=destBuffer; bufferInfo.mScratchBuffer=&sourceBuffer[mSubBufferSize]; return ProcessBuffer1x(&bufferInfo); } /* Scalar processing of one BufferInfo: handles a single buffer when mContiguousBufferSize is non-zero and four buffers otherwise, filtering window by window with Filter1x. NOTE(review): the loop headers and the tail of this function, together with the head of the following ProcessOne1x definition, are truncated in this copy. */ bool EffectEqualization48x::ProcessBuffer1x(BufferInfo *bufferInfo) { int bufferCount=bufferInfo->mContiguousBufferSize?1:4; for(int bufferIndex=0;bufferIndexmBufferLength; if(bufferInfo->mContiguousBufferSize) bufferLength=bufferInfo->mContiguousBufferSize; sampleCount blockCount=bufferLength/mBlockSize; sampleCount lastBlockSize=bufferLength%mBlockSize; if(lastBlockSize) blockCount++; float *workBuffer=bufferInfo->mScratchBuffer; // all scratch buffers are at the end float *scratchBuffer=&workBuffer[mWindowSize*2]; // all scratch buffers are at the end float *sourceBuffer=bufferInfo->mBufferSouce[bufferIndex]; float *destBuffer=bufferInfo->mBufferDest[bufferIndex]; for(int runx=0;runxFilter(mWindowSize, currentBuffer); Filter1x(mWindowSize, currentBuffer, scratchBuffer); float *writeEnd=currentBuffer+mBlockSize; if(runx==blockCount) writeEnd=currentBuffer+(lastBlockSize+mFilterSize); if(runx) { float *lastOverrun=&workBuffer[mWindowSize*((runx+1)&1)+mBlockSize]; for(int j=0;j>1; // this will skip the first filterSize on the first run while(currentBufferGetMaxBlockSize(); AudacityProject *p = GetActiveProject(); WaveTrack 
*output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); mEffectEqualization->TrackProgress(count, 0.0); int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active int bigRuns=len/(subBufferSize-mBlockSize); int trackBlocksPerBig=subBufferSize/trackBlockSize; int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize; int singleProcessLength; if(!bigRuns) singleProcessLength=len; else singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize)); sampleCount currentSample=start; bool bBreakLoop = false; for(int bigRun=0;bigRunGet((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize); currentSample+=trackBlockSize; } if(trackLeftovers) { t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); currentSample+=trackLeftovers; } currentSample-=mBlockSize+(mFilterSize>>1); ProcessBuffer1x(mBufferInfo); bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns); if( bBreakLoop ) break; output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1))); } if(singleProcessLength && !bBreakLoop) { t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1)); output->Append((samplePtr)&mBigBuffer[bigRuns?mBlockSize:0], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); } output->Flush(); if(!bBreakLoop) ProcessTail(t, output, start, len); delete output; return bBreakLoop; } /* Filters one window of len samples in place: forward real FFT on buffer, complex multiply of each bin by mFilterFuncR and mFilterFuncI into scratchBuffer (bins are located either through the BitReversed table or through SmallRB, chosen by bit 0 of sMathPath), inverse FFT, then ReorderToTime1x writes the time-domain result back into buffer. The DC and Fs/2 bins are scaled by the real response only. NOTE(review): the head of the bin loop is truncated in this copy. */ void EffectEqualization48x::Filter1x(sampleCount len, float *buffer, float *scratchBuffer) { int i; float real, imag; // Apply FFT RealFFTf1x(buffer, mEffectEqualization->hFFT); // Apply filter // DC component is purely real float filterFuncR, filterFuncI; 
filterFuncR=mEffectEqualization->mFilterFuncR[0]; scratchBuffer[0]=buffer[0]*filterFuncR; int halfLength=(len/2); bool useBitReverseTable=sMathPath&1; for(i=1; ihFFT->BitReversed[i] ]; imag=buffer[mEffectEqualization->hFFT->BitReversed[i]+1]; } else { int bitReversed=SmallRB(i,mEffectEqualization->hFFT->pow2Bits); real=buffer[bitReversed]; imag=buffer[bitReversed+1]; } filterFuncR=mEffectEqualization->mFilterFuncR[i]; filterFuncI=mEffectEqualization->mFilterFuncI[i]; scratchBuffer[2*i ] = real*filterFuncR - imag*filterFuncI; scratchBuffer[2*i+1] = real*filterFuncI + imag*filterFuncR; } // Fs/2 component is purely real filterFuncR=mEffectEqualization->mFilterFuncR[halfLength]; scratchBuffer[1] = buffer[1] * filterFuncR; // Inverse FFT and normalization InverseRealFFTf1x(scratchBuffer, mEffectEqualization->hFFT); ReorderToTime1x(mEffectEqualization->hFFT, scratchBuffer, buffer); } /* SSE variant of the buffer processor that works on the four source and destination sub-buffers of a BufferInfo through __m128 pointers. Returns false without doing any work when mBufferLength is not a multiple of mBlockSize, otherwise true. NOTE(review): the body of the run loop is truncated in this copy. */ bool EffectEqualization48x::ProcessBuffer4x(BufferInfo *bufferInfo) { // length must be a factor of window size for 4x processing. if(bufferInfo->mBufferLength%mBlockSize) return false; sampleCount blockCount=bufferInfo->mBufferLength/mBlockSize; __m128 *readBlocks[4]; // some temps so we dont destroy the vars in the struct __m128 *writeBlocks[4]; for(int i=0;i<4;i++) { readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i]; writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i]; } __m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer; __m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*2]; for(int run4x=0;run4x>2; // swizzle it back. 
for(int i=writeToStart,j=writeStart;j>2; // these are 128b pointers, each window is 1/4 blockSize for those writeBlocks[i]+=mBlockSize>>2; } } return true; } /* Single-threaded 4x path for one track: reads sub-buffer sized runs from t into mBigBuffer, runs ProcessBuffer4x on each, appends the result to a new float wave track, processes the remainder with ProcessBuffer, and pastes everything back with ProcessTail unless TrackProgress asked to stop. Returns bBreakLoop. NOTE(review): the early-out test on len and the loop headers are truncated in this copy. */ bool EffectEqualization48x::ProcessOne4x(int count, WaveTrack * t, sampleCount start, sampleCount len) { int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active if(lenGetMaxBlockSize(); AudacityProject *p = GetActiveProject(); WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); mEffectEqualization->TrackProgress(count, 0.0); int bigRuns=len/(subBufferSize-mBlockSize); int trackBlocksPerBig=subBufferSize/trackBlockSize; int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize; int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize)); sampleCount currentSample=start; bool bBreakLoop = false; for(int bigRun=0;bigRunGet((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize); currentSample+=trackBlockSize; } if(trackLeftovers) { t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); currentSample+=trackLeftovers; } currentSample-=mBlockSize+(mFilterSize>>1); ProcessBuffer4x(mBufferInfo); bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns); if( bBreakLoop ) break; output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1))); } if(singleProcessLength && !bBreakLoop) { t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1)); output->Append((samplePtr)&mBigBuffer[bigRuns?mBlockSize:0], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); // output->Append((samplePtr)&mBigBuffer[bigRuns?mBlockSize:0], floatSample, singleProcessLength); } 
output->Flush(); if(!bBreakLoop) ProcessTail(t, output, start, len); delete output; return bBreakLoop; } /* Worker thread body: loops until mExitLoop is set. Each pass locks mMutex, looks through mBufferInfoList for a buffer to take, unlocks, runs ProcessBuffer1x or ProcessBuffer4x according to mProcessingType (1 or 4), and then marks that buffer BufferDone. When nothing was taken the mutex is simply released. Returns NULL. NOTE(review): the buffer selection loop is truncated in this copy. */ void *EQWorker::Entry() { while(!mExitLoop) { mMutex->Lock(); bool bufferAquired=false; for(int i=0;iUnlock(); switch (mProcessingType) { case 1: mEffectEqualization48x->ProcessBuffer1x(&mBufferInfoList[i]); break; case 4: mEffectEqualization48x->ProcessBuffer4x(&mBufferInfoList[i]); break; } mBufferInfoList[i].mBufferStatus=BufferDone; // we're done break; } if(!bufferAquired) mMutex->Unlock(); } return NULL; } /* Threaded path for one track. Falls back to ProcessOne4x when the track holds fewer than 256 blocks or when mThreadCount is not positive. Otherwise it pre-fills the worker buffers, marks them BufferReady, and then, under mDataMutex, appends every BufferDone buffer to the output track in order and refills it while data remains. The remainder is handled with ProcessBuffer and the result is pasted back through ProcessTail. Returns bBreakLoop. NOTE(review): several loop headers are truncated in this copy. */ bool EffectEqualization48x::ProcessOne1x4xThreaded(int count, WaveTrack * t, sampleCount start, sampleCount len, int processingType) { int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active sampleCount blockCount=len/mBlockSize; if(blockCount<16) // it's not worth 4x processing do a regular process return ProcessOne4x(count, t, start, len); if(mThreadCount<=0 || blockCount<256) // dont do it without cores or big data return ProcessOne4x(count, t, start, len); for(int i=0;iGetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); sampleCount trackBlockSize = t->GetMaxBlockSize(); mEffectEqualization->TrackProgress(count, 0.0); int bigRuns=len/(subBufferSize-mBlockSize); int trackBlocksPerBig=subBufferSize/trackBlockSize; int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize; int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize)); sampleCount currentSample=start; int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0; // fill the first workerDataCount buffers we checked above and there is at least this data int maxPreFill=bigRunsGet((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize); currentSample+=trackBlockSize; } if(trackLeftovers) { t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); currentSample+=trackLeftovers; } 
currentSample-=mBlockSize+(mFilterSize>>1); mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin } int currentIndex=0; bool bBreakLoop = false; while(bigBlocksWrittenTrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns); if( bBreakLoop ) break; mDataMutex.Lock(); // Get in line for data // process as many blocks as we can while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWrittenAppend((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1))); bigBlocksWritten++; if(bigBlocksReadGet((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize); currentSample+=trackBlockSize; } if(trackLeftovers) { t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); currentSample+=trackLeftovers; } currentSample-=mBlockSize+(mFilterSize>>1); mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin bigBlocksRead++; } else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unnecessary currentIndex=(currentIndex+1)%mWorkerDataCount; } mDataMutex.Unlock(); // Get back in line for data } if(singleProcessLength && !bBreakLoop) { t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1)); output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); } output->Flush(); if(!bBreakLoop) ProcessTail(t, output, start, len); delete output; return bBreakLoop; } /* SSE form of the window filter: same steps as Filter1x but every element is an __m128, and each scalar filter coefficient is broadcast with _mm_set1_ps before the complex multiply. Uses RealFFTf4x, InverseRealFFTf4x and ReorderToTime4x. NOTE(review): the head of the bin loop is truncated in this copy. */ void EffectEqualization48x::Filter4x(sampleCount len, float *buffer, float *scratchBuffer) { int i; __m128 real128, imag128; // Apply FFT RealFFTf4x(buffer, mEffectEqualization->hFFT); // Apply filter // DC 
component is purely real __m128 *localFFTBuffer=(__m128 *)scratchBuffer; __m128 *localBuffer=(__m128 *)buffer; __m128 filterFuncR, filterFuncI; filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[0]); localFFTBuffer[0]=_mm_mul_ps(localBuffer[0], filterFuncR); int halfLength=(len/2); bool useBitReverseTable=sMathPath&1; for(i=1; ihFFT->BitReversed[i] ]; imag128=localBuffer[mEffectEqualization->hFFT->BitReversed[i]+1]; } else { int bitReversed=SmallRB(i,mEffectEqualization->hFFT->pow2Bits); real128=localBuffer[bitReversed]; imag128=localBuffer[bitReversed+1]; } filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[i]); filterFuncI=_mm_set1_ps(mEffectEqualization->mFilterFuncI[i]); localFFTBuffer[2*i ] = _mm_sub_ps( _mm_mul_ps(real128, filterFuncR), _mm_mul_ps(imag128, filterFuncI)); localFFTBuffer[2*i+1] = _mm_add_ps( _mm_mul_ps(real128, filterFuncI), _mm_mul_ps(imag128, filterFuncR)); } // Fs/2 component is purely real filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]); localFFTBuffer[1] = _mm_mul_ps(localBuffer[1], filterFuncR); // Inverse FFT and normalization InverseRealFFTf4x(scratchBuffer, mEffectEqualization->hFFT); ReorderToTime4x(mEffectEqualization->hFFT, scratchBuffer, buffer); } #ifdef __AVX_ENABLED // note although written it has not been tested /* AVX variant of the buffer processor working on eight sub-buffers. Returns false without doing any work unless mBufferLength is a multiple of mBlockSize and mBufferCount equals 8. NOTE(review): the body of the run loop is truncated in this copy. */ bool EffectEqualization48x::ProcessBuffer8x(BufferInfo *bufferInfo) { // length must be a factor of window size for 4x processing. 
if(bufferInfo->mBufferLength%mBlockSize || mBufferCount!=8) return false; sampleCount blockCount=bufferInfo->mBufferLength/mBlockSize; __m128 *readBlocks[8]; // some temps so we dont destroy the vars in the struct __m128 *writeBlocks[8]; for(int i=0;i<8;i++) { readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i]; writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i]; } __m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer; __m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*4]; int doubleFilter=mFilterSize<<1; int doubleWindow=mWindowSize<<1; int doubleBlock=mBlockSize<<1; for(int run4x=0;run4x>2; // swizzle it back. for(int i=writeToStart,j=writeStart;j>2; // these are 128b pointers, each window is 1/4 blockSize for those writeBlocks[i]+=mBlockSize>>2; } } return true; } /* Single-threaded 8x path for one track; falls back to ProcessOne4x when the track holds fewer than 32 blocks. Structure mirrors ProcessOne4x but uses the full mSubBufferSize. Returns bBreakLoop. NOTE(review): the main loop calls ProcessBuffer4x rather than ProcessBuffer8x - confirm whether that is intended. NOTE(review): loop headers are truncated in this copy. */ bool EffectEqualization48x::ProcessOne8x(int count, WaveTrack * t, sampleCount start, sampleCount len) { sampleCount blockCount=len/mBlockSize; if(blockCount<32) // it's not worth 8x processing do a regular process return ProcessOne4x(count, t, start, len); sampleCount trackBlockSize = t->GetMaxBlockSize(); AudacityProject *p = GetActiveProject(); WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); mEffectEqualization->TrackProgress(count, 0.0); int bigRuns=len/(mSubBufferSize-mBlockSize); int trackBlocksPerBig=mSubBufferSize/trackBlockSize; int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize; int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize)); sampleCount currentSample=start; bool bBreakLoop = false; for(int bigRun=0;bigRunGet((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize); currentSample+=trackBlockSize; } if(trackLeftovers) { t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); currentSample+=trackLeftovers; } currentSample-=mBlockSize+(mFilterSize>>1); ProcessBuffer4x(mBufferInfo); if 
(bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns)) { break; } output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1))); } if(singleProcessLength && !bBreakLoop) { t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1)); output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); } output->Flush(); if(!bBreakLoop) ProcessTail(t, output, start, len); delete output; return bBreakLoop; } /* Threaded 8x path for one track; falls back to ProcessOne4x when the track holds fewer than 256 blocks or when mThreadCount is not positive. It pre-fills the worker buffers, then under mDataMutex appends each BufferDone buffer to the output track and refills it, and finally handles the remainder with ProcessBuffer and pastes the result back through ProcessTail. Returns bBreakLoop. NOTE(review): several loop headers are truncated in this copy. */ bool EffectEqualization48x::ProcessOne8xThreaded(int count, WaveTrack * t, sampleCount start, sampleCount len) { sampleCount blockCount=len/mBlockSize; if(blockCount<16) // it's not worth 4x processing do a regular process return ProcessOne4x(count, t, start, len); if(mThreadCount<=0 || blockCount<256) // dont do it without cores or big data return ProcessOne4x(count, t, start, len); AudacityProject *p = GetActiveProject(); WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); sampleCount trackBlockSize = t->GetMaxBlockSize(); mEffectEqualization->TrackProgress(count, 0.0); int bigRuns=len/(mSubBufferSize-mBlockSize); int trackBlocksPerBig=mSubBufferSize/trackBlockSize; int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize; int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize)); sampleCount currentSample=start; int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0; // fill the first workerDataCount buffers we checked above and there is at least this data for(int i=0;iGet((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize); currentSample+=trackBlockSize; } if(trackLeftovers) { 
t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); currentSample+=trackLeftovers; } currentSample-=mBlockSize+(mFilterSize>>1); mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin } int currentIndex=0; bool bBreakLoop = false; while(bigBlocksWrittenTrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns)) { break; } mDataMutex.Lock(); // Get in line for data // process as many blocks as we can while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWrittenAppend((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1))); bigBlocksWritten++; if(bigBlocksReadGet((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize); currentSample+=trackBlockSize; } if(trackLeftovers) { t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); currentSample+=trackLeftovers; } currentSample-=mBlockSize+(mFilterSize>>1); mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin bigBlocksRead++; } else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unnecessary currentIndex=(currentIndex+1)%mWorkerDataCount; } mDataMutex.Unlock(); // Get back in line for data } if(singleProcessLength && !bBreakLoop) { t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1)); output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); } output->Flush(); if(!bBreakLoop) ProcessTail(t, output, start, len); delete output; return bBreakLoop; } /* AVX form of the window filter: same steps as Filter1x but every element is an __m256, and each scalar filter coefficient is broadcast with _mm256_set1_ps before the complex multiply. Uses RealFFTf8x, InverseRealFFTf8x and ReorderToTime8x. NOTE(review): the head of the bin loop is truncated in this copy. */ void EffectEqualization48x::Filter8x(sampleCount len, float 
*buffer, float *scratchBuffer) { int i; __m256 real256, imag256; // Apply FFT RealFFTf8x(buffer, mEffectEqualization->hFFT); // Apply filter // DC component is purely real __m256 *localFFTBuffer=(__m256 *)scratchBuffer; __m256 *localBuffer=(__m256 *)buffer; __m256 filterFuncR, filterFuncI; filterFuncR=_mm256_set1_ps(mEffectEqualization->mFilterFuncR[0]); localFFTBuffer[0]=_mm256_mul_ps(localBuffer[0], filterFuncR); int halfLength=(len/2); bool useBitReverseTable=sMathPath&1; for(i=1; ihFFT->BitReversed[i] ]; imag256=localBuffer[mEffectEqualization->hFFT->BitReversed[i]+1]; } else { int bitReversed=SmallRB(i,mEffectEqualization->hFFT->pow2Bits); real256=localBuffer[bitReversed]; imag256=localBuffer[bitReversed+1]; } filterFuncR=_mm256_set1_ps(mEffectEqualization->mFilterFuncR[i]); filterFuncI=_mm256_set1_ps(mEffectEqualization->mFilterFuncI[i]); localFFTBuffer[2*i ] = _mm256_sub_ps( _mm256_mul_ps(real256, filterFuncR), _mm256_mul_ps(imag256, filterFuncI)); localFFTBuffer[2*i+1] = _mm256_add_ps( _mm256_mul_ps(real256, filterFuncI), _mm256_mul_ps(imag256, filterFuncR)); } // Fs/2 component is purely real filterFuncR=_mm256_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]); localFFTBuffer[1] = _mm256_mul_ps(localBuffer[1], filterFuncR); // Inverse FFT and normalization InverseRealFFTf8x(scratchBuffer, mEffectEqualization->hFFT); ReorderToTime8x(mEffectEqualization->hFFT, scratchBuffer, buffer); } #endif #endif