More efficient executable allocator (#83632)
* Add multi-element caching scheme to ExecutableAllocator (see the sketch after the changed-file summary below)

* Code generated by CodePageGenerator for LoaderHeaps should not cache the RW mapping it creates: the mapping is never accessed again, and caching it evicts useful data from the cache

* Fix build breaks introduced by executable mapping cache changes

* Fix build breaks caused by changes to introduce the concept of executable pages that aren't cached ever

* Move VSD executable heaps from being LoaderHeaps to being CodeFragmentHeaps
- Should reduce the amount of contention on the ExecutableAllocator cache
- Will improve the performance of identifying what type of stub is in use by avoiding the RangeList structure (see the range-check sketch after the request.cpp diff below)
  - Note: this only applies to stubs used in somewhat larger applications

* Add statistics gathering features to ExecutableAllocator

* In progress

* Fix DAC API failure when called early in process startup

* Implement interleaved stubs as 16KB pages instead of 4KB pages

* Remove incorrectly added API

* Adjust cache size down to 3, and leave a breadcrumb for enabling more cache size exploration

* Fix x86 build

* Tweaks to make it all build and fix some bugs
- Notably, arm32 still uses only 4K pages, as before, since it cannot encode the required immediate offset.

* Add statistics for linked list walk lengths

* Reorder linked list on access

* Fix some more asserts and build breaks

* Fix Arm build for real this time, and fix Unix Arm64 miscalculation of which stubs to use

* Update based on code review comments

* More code review feedback

* Fix oops

* Attempt to fix Unix Arm64 build

* Try tweaking the number of cached mappings to see if the illegal instruction signal will go away in our testing

* Revert "Try tweaking the number of cached mappings to see if the illegal instruction signal will go away in our testing"

This reverts commit 9838460.

* Fix last code review comment
davidwrighton authored Apr 13, 2023
1 parent e73d7fc commit 11a0671
Showing 42 changed files with 419 additions and 867 deletions.
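The first bullet's multi-element cache is the core of the change: MapRW lookups first consult a tiny most-recently-used array of RW mappings before walking the allocator's full block list. A minimal C++ sketch of the idea under stated assumptions (BlockRW is simplified here; the real code sits behind ExecutableAllocator's critical section and also manages reference counts):

```cpp
#include <cstddef>

// Simplified stand-in for the allocator's RW-mapping descriptor.
struct BlockRW { void* baseRX; size_t size; void* baseRW; };

static const int kCacheSize = 3;          // mirrors m_cachedMapping[3] below
static BlockRW* g_cache[kCacheSize] = {}; // index 0 = most recently used

// Move a cache hit to the front, preserving the order of the other entries.
static void MoveToFront(int index)
{
    BlockRW* hit = g_cache[index];
    for (int j = index; j > 0; j--)
        g_cache[j] = g_cache[j - 1];
    g_cache[0] = hit;
}

// Insert a fresh mapping, evicting the least recently used entry.
static void UpdateCachedMapping(BlockRW* pBlock)
{
    for (int j = kCacheSize - 1; j > 0; j--)
        g_cache[j] = g_cache[j - 1];
    g_cache[0] = pBlock;
}

// Translate an RX address to its cached RW alias, or nullptr on a miss.
static void* FindCached(void* baseRX, size_t size)
{
    for (int i = 0; i < kCacheSize; i++)
    {
        BlockRW* p = g_cache[i];
        if (p != nullptr &&
            (char*)baseRX >= (char*)p->baseRX &&
            (char*)baseRX + size <= (char*)p->baseRX + p->size)
        {
            MoveToFront(i); // reorder on access, as the commit message describes
            return (char*)p->baseRW + ((char*)baseRX - (char*)p->baseRX);
        }
    }
    return nullptr; // miss: the allocator walks its full list of RW blocks
}
```

The "do not add to cache" bullet follows directly from this design: a mapping that is written once and never looked up again would only displace a useful entry from the three slots.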
4 changes: 0 additions & 4 deletions src/coreclr/debug/daccess/dacdbiimpl.cpp
@@ -3617,11 +3617,7 @@ void DacDbiInterfaceImpl::EnumerateMemRangesForLoaderAllocator(PTR_LoaderAllocat
if (pVcsMgr)
{
if (pVcsMgr->indcell_heap != NULL) heapsToEnumerate.Push(pVcsMgr->indcell_heap);
-if (pVcsMgr->lookup_heap != NULL) heapsToEnumerate.Push(pVcsMgr->lookup_heap);
-if (pVcsMgr->resolve_heap != NULL) heapsToEnumerate.Push(pVcsMgr->resolve_heap);
-if (pVcsMgr->dispatch_heap != NULL) heapsToEnumerate.Push(pVcsMgr->dispatch_heap);
if (pVcsMgr->cache_entry_heap != NULL) heapsToEnumerate.Push(pVcsMgr->cache_entry_heap);
-if (pVcsMgr->vtable_heap != NULL) heapsToEnumerate.Push(pVcsMgr->vtable_heap);
}

TADDR rangeAccumAsTaddr = TO_TADDR(rangeAcummulator);
1 change: 1 addition & 0 deletions src/coreclr/debug/daccess/fntableaccess.h
@@ -41,6 +41,7 @@ struct FakeHeapList
DWORD_PTR pHdrMap; // changed from DWORD*
size_t maxCodeHeapSize;
size_t reserveForJumpStubs;
+DWORD_PTR pLoaderAllocator;
#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
DWORD_PTR CLRPersonalityRoutine;
#endif
24 changes: 0 additions & 24 deletions src/coreclr/debug/daccess/request.cpp
@@ -3531,26 +3531,10 @@ ClrDataAccess::TraverseVirtCallStubHeap(CLRDATA_ADDRESS pAppDomain, VCSHeapType
pLoaderHeap = pVcsMgr->indcell_heap;
break;

-case LookupHeap:
-pLoaderHeap = pVcsMgr->lookup_heap;
-break;
-
-case ResolveHeap:
-pLoaderHeap = pVcsMgr->resolve_heap;
-break;
-
-case DispatchHeap:
-pLoaderHeap = pVcsMgr->dispatch_heap;
-break;
-
case CacheEntryHeap:
pLoaderHeap = pVcsMgr->cache_entry_heap;
break;

-case VtableHeap:
-pLoaderHeap = pVcsMgr->vtable_heap;
-break;
-
default:
hr = E_INVALIDARG;
}
@@ -3597,11 +3581,7 @@ static const char *LoaderAllocatorLoaderHeapNames[] =
"FixupPrecodeHeap",
"NewStubPrecodeHeap",
"IndcellHeap",
"LookupHeap",
"ResolveHeap",
"DispatchHeap",
"CacheEntryHeap",
"VtableHeap",
};


@@ -3644,11 +3624,7 @@ HRESULT ClrDataAccess::GetLoaderAllocatorHeaps(CLRDATA_ADDRESS loaderAllocatorAd
else
{
pLoaderHeaps[i++] = HOST_CDADDR(pVcsMgr->indcell_heap);
-pLoaderHeaps[i++] = HOST_CDADDR(pVcsMgr->lookup_heap);
-pLoaderHeaps[i++] = HOST_CDADDR(pVcsMgr->resolve_heap);
-pLoaderHeaps[i++] = HOST_CDADDR(pVcsMgr->dispatch_heap);
pLoaderHeaps[i++] = HOST_CDADDR(pVcsMgr->cache_entry_heap);
-pLoaderHeaps[i++] = HOST_CDADDR(pVcsMgr->vtable_heap);
}

// All of the above are "LoaderHeap" and not the ExplicitControl version.
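These DAC deletions mirror the VSD heap move: once each stub kind is carved out of its own CodeFragmentHeap with a known contiguous reservation, identifying the kind of a stub can be a plain range check instead of a RangeList walk. A sketch of that idea only; the types and ranges here are illustrative, not the runtime's actual structures:

```cpp
#include <cstddef>

enum class StubKind { Lookup, Dispatch, Resolve, Vtable, Unknown };

// Hypothetical descriptor: one contiguous reservation per stub kind.
struct HeapRange { const char* start; const char* end; StubKind kind; };

static StubKind ClassifyStub(const char* pStubCode, const HeapRange* ranges, size_t count)
{
    for (size_t i = 0; i < count; i++)
    {
        // Each kind's heap is contiguous, so a bounds test identifies the stub.
        if (pStubCode >= ranges[i].start && pStubCode < ranges[i].end)
            return ranges[i].kind;
    }
    return StubKind::Unknown;
}
```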
48 changes: 38 additions & 10 deletions src/coreclr/inc/executableallocator.h
@@ -20,6 +20,16 @@
// This class is responsible for allocation of all the executable memory in the runtime.
class ExecutableAllocator
{
+public:
+
+enum CacheableMapping
+{
+AddToCache,
+DoNotAddToCache,
+};
+
+private:
+
// RX address range block descriptor
struct BlockRX
{
@@ -61,6 +71,11 @@ class ExecutableAllocator

static int64_t g_releaseCount;
static int64_t g_reserveCount;

+static int64_t g_MapRW_Calls;
+static int64_t g_MapRW_CallsWithCacheMiss;
+static int64_t g_MapRW_LinkedListWalkDepth;
+static int64_t g_LinkedListTotalDepth;
#endif
// Instance of the allocator
static ExecutableAllocator* g_instance;
@@ -102,9 +117,19 @@ class ExecutableAllocator
// for platforms that don't use shared memory.
size_t m_freeOffset = 0;

-// Last RW mapping cached so that it can be reused for the next mapping
+// Uncomment these to gather information to better choose caching parameters
+//#define VARIABLE_SIZED_CACHEDMAPPING_SIZE
+
+// Last RW mappings cached so that it can be reused for the next mapping
// request if it goes into the same range.
-BlockRW* m_cachedMapping = NULL;
+// This is handled as a 3 element cache with an LRU replacement policy
+#ifdef VARIABLE_SIZED_CACHEDMAPPING_SIZE
+// If variable sized mappings enabled, make the cache physically big enough to cover all interesting sizes
+static int g_cachedMappingSize;
+BlockRW* m_cachedMapping[16] = { 0 };
+#else
+BlockRW* m_cachedMapping[3] = { 0 };
+#endif

// Synchronization of the public allocator methods
CRITSEC_COOKIE m_CriticalSection;
@@ -114,15 +139,18 @@ class ExecutableAllocator
// and replaces it by the passed in one.
void UpdateCachedMapping(BlockRW *pBlock);

-// Remove the cached mapping
-void RemoveCachedMapping();
+// Remove the cached mapping (1 based indexing)
+void RemoveCachedMapping(size_t indexToRemove);
+
+// Find an overlapped cached mapping with pBlock, or return 0
+size_t FindOverlappingCachedMapping(BlockRX* pBlock);

// Find existing RW block that maps the whole specified range of RX memory.
// Return NULL if no such block exists.
-void* FindRWBlock(void* baseRX, size_t size);
+void* FindRWBlock(void* baseRX, size_t size, CacheableMapping cacheMapping);

// Add RW block to the list of existing RW blocks
-bool AddRWBlock(void* baseRW, void* baseRX, size_t size);
+bool AddRWBlock(void* baseRW, void* baseRX, size_t size, CacheableMapping cacheMapping);

// Remove RW block from the list of existing RW blocks and return the base
// address and size the underlying memory was mapped at.
@@ -230,7 +258,7 @@ class ExecutableAllocator
void Release(void* pRX);

// Map the specified block of executable memory as RW
-void* MapRW(void* pRX, size_t size);
+void* MapRW(void* pRX, size_t size, CacheableMapping cacheMapping);

// Unmap the RW mapping at the specified address
void UnmapRW(void* pRW);
@@ -290,14 +318,14 @@ class ExecutableWriterHolder
{
}

-ExecutableWriterHolder(T* addressRX, size_t size)
+ExecutableWriterHolder(T* addressRX, size_t size, ExecutableAllocator::CacheableMapping cacheMapping = ExecutableAllocator::AddToCache)
{
m_addressRX = addressRX;
#if defined(HOST_OSX) && defined(HOST_ARM64)
m_addressRW = addressRX;
PAL_JitWriteProtect(true);
#else
-m_addressRW = (T *)ExecutableAllocator::Instance()->MapRW((void*)addressRX, size);
+m_addressRW = (T *)ExecutableAllocator::Instance()->MapRW((void*)addressRX, size, cacheMapping);
#endif
}

@@ -320,7 +348,7 @@

#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS
#undef ExecutableWriterHolder
-#ifdef TARGET_UNIX
+#ifdef HOST_UNIX
#define ExecutableWriterHolder ExecutableAllocator::LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__); ExecutableWriterHolderNoLog
#define AssignExecutableWriterHolder(addressRX, size) AssignExecutableWriterHolder(addressRX, size); ExecutableAllocator::LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__);
#else
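The new CacheableMapping argument threads through MapRW and the holder above so that one-shot writers can opt out of the cache. A hedged usage fragment (the stub type, field, and pointers are hypothetical; the holder, enum, and GetRW accessor are from this header):

```cpp
// Hypothetical call site: patch a stub whose RW mapping will never be reused.
// DoNotAddToCache keeps this one-shot mapping from evicting a hot cache entry.
ExecutableWriterHolder<MyStubData> stubWriterHolder(
    pStubDataRX,                           // RX address of the stub's data
    sizeof(MyStubData),
    ExecutableAllocator::DoNotAddToCache);
stubWriterHolder.GetRW()->m_pTarget = pNewTarget; // write through the RW alias
// Leaving scope unmaps/releases the RW view via the holder's destructor.
```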
2 changes: 1 addition & 1 deletion src/coreclr/inc/holder.h
@@ -946,7 +946,7 @@ FORCEINLINE void StubRelease(TYPE* value)
if (value)
{
#ifdef LOG_EXECUTABLE_ALLOCATOR_STATISTICS
-#ifdef TARGET_UNIX
+#ifdef HOST_UNIX
LOGGER::LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__);
#else
LOGGER::LogUsage(__FILE__, __LINE__, __FUNCTION__);
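Both files switch the logging macro selection from TARGET_UNIX to HOST_UNIX because the choice between __PRETTY_FUNCTION__ and __FUNCTION__ depends on the compiler building the code, not the platform the code targets. A minimal self-contained illustration of the pattern, with a stand-in logger (not coreclr's LogUsage):

```cpp
#include <cstdio>

// Stand-in logger: records which call site created an executable writer
// holder when statistics gathering is enabled.
static void LogUsage(const char* file, int line, const char* function)
{
    std::printf("%s:%d %s\n", file, line, function);
}

#if defined(__GNUC__) || defined(__clang__)
#define LOG_CALL_SITE() LogUsage(__FILE__, __LINE__, __PRETTY_FUNCTION__) // GCC/Clang spelling
#else
#define LOG_CALL_SITE() LogUsage(__FILE__, __LINE__, __FUNCTION__)        // MSVC spelling
#endif
```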
20 changes: 16 additions & 4 deletions src/coreclr/inc/loaderheap.h
@@ -154,6 +154,17 @@ struct LoaderHeapEvent;



+// When an interleaved LoaderHeap is constructed, this is the interleaving size
+inline UINT32 GetStubCodePageSize()
+{
+#if defined(TARGET_ARM64) && defined(TARGET_UNIX)
+return max(16*1024, GetOsPageSize());
+#elif defined(TARGET_ARM)
+return 4096; // ARM is special as the 32bit instruction set does not easily permit a 16KB offset
+#else
+return 16*1024;
+#endif
+}
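GetStubCodePageSize is the interleaving unit: each code page of that size is followed immediately by an equally sized data page, which is how the commit grows interleaved stubs from 4KB to 16KB pages (except on arm32, and on Unix ARM64 the OS page size may be larger). A sketch of the resulting address math; the helper name is ours, not the runtime's:

```cpp
// Assuming the interleaved layout, a stub's data slot lives exactly one page
// after its code, so the offset is a per-target constant.
inline BYTE* StubDataAddress(BYTE* pStubCode)
{
    return pStubCode + GetStubCodePageSize();
}
```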



@@ -185,6 +196,7 @@ class UnlockedLoaderHeap
{
#ifdef _DEBUG
friend class LoaderHeapSniffer;
+friend struct LoaderHeapFreeBlock;
#endif

#ifdef DACCESS_COMPILE
@@ -276,7 +288,7 @@ class UnlockedLoaderHeap

public:
BOOL m_fExplicitControl; // Am I a LoaderHeap or an ExplicitControlLoaderHeap?
-void (*m_codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX);
+void (*m_codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size);

#ifdef DACCESS_COMPILE
public:
@@ -298,7 +310,7 @@ class UnlockedLoaderHeap
SIZE_T dwReservedRegionSize,
RangeList *pRangeList = NULL,
HeapKind kind = HeapKind::Data,
-void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX) = NULL,
+void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size) = NULL,
DWORD dwGranularity = 1);

~UnlockedLoaderHeap();
@@ -467,7 +479,7 @@ class LoaderHeap : public UnlockedLoaderHeap, public ILoaderHeapBackout
RangeList *pRangeList = NULL,
UnlockedLoaderHeap::HeapKind kind = UnlockedLoaderHeap::HeapKind::Data,
BOOL fUnlocked = FALSE,
-void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX) = NULL,
+void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size) = NULL,
DWORD dwGranularity = 1
)
: UnlockedLoaderHeap(dwReserveBlockSize,
@@ -491,7 +503,7 @@ class LoaderHeap : public UnlockedLoaderHeap, public ILoaderHeapBackout
RangeList *pRangeList = NULL,
UnlockedLoaderHeap::HeapKind kind = UnlockedLoaderHeap::HeapKind::Data,
BOOL fUnlocked = FALSE,
-void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX) = NULL,
+void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size) = NULL,
DWORD dwGranularity = 1
)
: UnlockedLoaderHeap(dwReserveBlockSize,
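The SIZE_T parameter added throughout these signatures tells a code page generator how big the page it is filling actually is, now that the size varies by target (16KB on most, 4KB on arm32). A sketch of a conforming generator under stated assumptions (BYTE/SIZE_T are the runtime's usual typedefs; the names and the 32-byte slot size are ours, not coreclr's):

```cpp
#include <cstring>

const SIZE_T kStubSize = 32; // hypothetical fixed stub slot size

static void EmitStubBody(BYTE* pRW, BYTE* pRX)
{
    (void)pRX;                    // a real generator uses pRX for PC-relative targets
    memset(pRW, 0xCC, kStubSize); // placeholder body: int3 padding
}

// Matches the new callback shape: pageBase is the writable mapping being
// filled, pageBaseRX is the address the page will execute at, and size is
// the interleaved page size reported by GetStubCodePageSize().
static void ExampleCodePageGenerator(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size)
{
    for (SIZE_T offset = 0; offset < size; offset += kStubSize)
        EmitStubBody(pageBase + offset, pageBaseRX + offset);
}
```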