Skip to content

Commit

Permalink
Fix QueryPerformanceCounter eating more than half the CPU on AMD Win-…
Browse files Browse the repository at this point in the history
…x86_64

The issue was in fast timers, making the naming highly ironic.
Rewrote to use RDTSC intrinsic. Doubled the framerate.
  • Loading branch information
siana committed Nov 17, 2015
1 parent 62a7704 commit c358ffe
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 64 deletions.
28 changes: 5 additions & 23 deletions indra/llcommon/llfasttimer_class.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@

#if LL_WINDOWS
#include "lltimer.h"
#include <intrin.h>
#elif LL_LINUX || LL_SOLARIS
#include <sys/time.h>
#include <sched.h>
Expand Down Expand Up @@ -184,7 +185,7 @@ LLMutex* LLFastTimer::sLogLock = NULL;
std::queue<LLSD> LLFastTimer::sLogQueue;
const int LLFastTimer::NamedTimer::HISTORY_NUM = 300;

#if defined(LL_WINDOWS) && !defined(_WIN64)
#if defined(LL_WINDOWS)
#define USE_RDTSC 1
#endif

Expand Down Expand Up @@ -952,34 +953,15 @@ LLFastTimer::LLFastTimer(LLFastTimer::FrameState* state)
#if USE_RDTSC
// Returns the CPU time-stamp counter shifted right by 8 bits and truncated
// to 32 bits.
//
// Implemented with the __rdtsc() compiler intrinsic instead of an inline
// __asm block so that the same code builds for both 32-bit and 64-bit
// Windows targets. The result is identical to the former hand-written
// sequence ((eax >> 8) | (edx << 24)), which is the low 32 bits of
// (counter >> 8).
U32 LLFastTimer::getCPUClockCount32()
{
	return (U32)(__rdtsc() >> 8);
}

// return full timer value, *not* shifted by 8 bits
U64 LLFastTimer::getCPUClockCount64()
{
	// Read the full 64-bit time-stamp counter through the __rdtsc()
	// intrinsic rather than an inline __asm block, so this also builds
	// for 64-bit Windows targets. Unlike getCPUClockCount32(), the value
	// is returned unshifted.
	return (U64)__rdtsc();
}

#else
//LL_COMMON_API U64 get_clock_count(); // in lltimer.cpp
// These use QueryPerformanceCounter, which is arguably fine and also works on AMD architectures.
Expand Down
82 changes: 41 additions & 41 deletions indra/llcommon/lltimer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ void ms_sleep(U32 ms)

// Sleeps for roughly `us` microseconds by delegating to ms_sleep(), so the
// requested time is truncated to whole milliseconds (us / 1000).
//
// us         - requested sleep time in microseconds.
// max_yields - ignored by this variant; it exists so the signature matches
//              the LL_LINUX / LL_SOLARIS / LL_DARWIN implementation.
//
// Always returns 0 (this variant never yields).
U32 micro_sleep(U64 us, U32 max_yields)
{
	// max_yields is unused; just fiddle with it to avoid warnings.
	max_yields = 0;
	// ms_sleep() takes a U32, so make the narrowing from U64 explicit.
	ms_sleep((U32)(us / 1000));
	return 0;
}
#elif LL_LINUX || LL_SOLARIS || LL_DARWIN
static void _sleep_loop(struct timespec& thiswait)
Expand All @@ -103,8 +103,8 @@ static void _sleep_loop(struct timespec& thiswait)
if (sleep_more)
{
if ( nextwait.tv_sec > thiswait.tv_sec ||
(nextwait.tv_sec == thiswait.tv_sec &&
nextwait.tv_nsec >= thiswait.tv_nsec) )
(nextwait.tv_sec == thiswait.tv_sec &&
nextwait.tv_nsec >= thiswait.tv_nsec) )
{
// if the remaining time isn't actually going
// down then we're being shafted by low clock
Expand All @@ -130,31 +130,31 @@ static void _sleep_loop(struct timespec& thiswait)

// Sleeps for roughly `us` microseconds: first a coarse kernel sleep via
// _sleep_loop(), then a loop of sched_yield() calls to cover the remainder.
//
// us         - requested sleep time in microseconds.
// max_yields - upper bound on the number of sched_yield() calls made after
//              the coarse sleep.
//
// Returns the number of sched_yield() calls actually performed.
U32 micro_sleep(U64 us, U32 max_yields)
{
	U64 start = get_clock_count();
	// This is kernel dependent.  Currently, our kernel generates software clock
	// interrupts at 250 Hz (every 4,000 microseconds).
	const U64 KERNEL_SLEEP_INTERVAL_US = 4000;
	const U64 HALF_INTERVAL_US = KERNEL_SLEEP_INTERVAL_US >> 1;

	// Number of whole kernel ticks that fit in the request once half a tick
	// is held back. The subtraction is guarded because `us` is unsigned:
	// for us < HALF_INTERVAL_US it would otherwise wrap to a huge value.
	U64 num_sleep_intervals = 0;
	if (us > HALF_INTERVAL_US)
	{
		num_sleep_intervals = (us - HALF_INTERVAL_US) / KERNEL_SLEEP_INTERVAL_US;
	}

	if (num_sleep_intervals > 0)
	{
		// Sleep half a tick short of the tick-aligned time; the yield loop
		// below covers whatever is left.
		U64 sleep_time = (num_sleep_intervals * KERNEL_SLEEP_INTERVAL_US) - HALF_INTERVAL_US;
		struct timespec thiswait;
		thiswait.tv_sec = sleep_time / 1000000;
		thiswait.tv_nsec = (sleep_time % 1000000) * 1000l;
		_sleep_loop(thiswait);
	}

	// NOTE(review): the comparison against `us` assumes get_clock_count()
	// reports microseconds on this platform -- confirm.
	U64 current_clock = get_clock_count();
	U32 yields = 0;
	while ( (yields < max_yields)
			&& (current_clock - start < us) )
	{
		sched_yield();
		++yields;
		current_clock = get_clock_count();
	}
	return yields;
}

void ms_sleep(U32 ms)
Expand All @@ -163,7 +163,7 @@ void ms_sleep(U32 ms)
struct timespec thiswait;
thiswait.tv_sec = ms / 1000;
thiswait.tv_nsec = (mslong % 1000) * 1000000l;
_sleep_loop(thiswait);
_sleep_loop(thiswait);
}
#else
# error "architecture not supported"
Expand Down Expand Up @@ -411,15 +411,15 @@ BOOL LLTimer::knownBadTimer()

#if LL_WINDOWS
WCHAR bad_pci_list[][10] = {L"1039:0530",
L"1039:0620",
L"10B9:0533",
L"10B9:1533",
L"1106:0596",
L"1106:0686",
L"1166:004F",
L"1166:0050",
L"8086:7110",
L"\0"
L"1039:0620",
L"10B9:0533",
L"10B9:1533",
L"1106:0596",
L"1106:0686",
L"1166:004F",
L"1166:0050",
L"8086:7110",
L"\0"
};

HKEY hKey = NULL;
Expand Down

0 comments on commit c358ffe

Please sign in to comment.