// ss928_framework/thridpart/ncnn/include/allocator.h
// (vendored ncnn header; snapshot dated 2024-12-16 13:31:45 +08:00)
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_ALLOCATOR_H
#define NCNN_ALLOCATOR_H
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include "platform.h"
#include <stdlib.h>
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
#include <android/hardware_buffer.h>
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
namespace ncnn {
// the alignment of all the allocated buffers
#if NCNN_AVX512
#define NCNN_MALLOC_ALIGN 64
#elif NCNN_AVX
#define NCNN_MALLOC_ALIGN 32
#else
#define NCNN_MALLOC_ALIGN 16
#endif
// we have some optimized kernels that may overread buffer a bit in loop
// it is common to interleave next-loop data load with arithmetic instructions
// allocating more bytes keeps us safe from SEGV_ACCERR failure
#define NCNN_MALLOC_OVERREAD 64
// Aligns a pointer to the specified number of bytes
// ptr Aligned pointer
// n Alignment size that must be a power of two
// Round a pointer up to the next n-byte boundary.
// ptr Pointer to align
// n   Alignment in bytes; must be a power of two (defaults to sizeof(_Tp))
template<typename _Tp>
static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
{
    // for a power-of-two n, ~(n - 1) == -n, so adding the mask and
    // clearing the low bits rounds the address up to the boundary
    const size_t mask = (size_t)(n - 1);
    return (_Tp*)(((size_t)ptr + mask) & ~mask);
}
// Aligns a buffer size to the specified number of bytes
// The function returns the minimum number that is greater or equal to sz and is divisible by n
// sz Buffer size to align
// n Alignment size that must be a power of two
// Round a buffer size up to the next multiple of n.
// Returns the smallest value >= sz that is divisible by n.
// sz Buffer size to align
// n  Alignment; must be a power of two
static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
{
    // equivalent to (sz + n - 1) & -n: ~(n - 1) == -n for power-of-two n
    const size_t mask = (size_t)(n - 1);
    return (sz + mask) & ~mask;
}
// Allocate size bytes whose address is a multiple of NCNN_MALLOC_ALIGN.
// NCNN_MALLOC_OVERREAD extra bytes are reserved past the end so optimized
// kernels that read slightly beyond the buffer stay inside mapped memory.
// Returns 0 on failure; release the pointer with fastFree() only.
static NCNN_FORCEINLINE void* fastMalloc(size_t size)
{
#if _MSC_VER
    // pairs with _aligned_free in fastFree
    // FIX: reserve the overread padding here too — every other branch adds
    // NCNN_MALLOC_OVERREAD, and the kernels that overread run on MSVC builds as well
    return _aligned_malloc(size + NCNN_MALLOC_OVERREAD, NCNN_MALLOC_ALIGN);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
    void* ptr = 0;
    // posix_memalign returns non-zero on failure and leaves ptr unspecified
    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
        ptr = 0;
    return ptr;
#elif __ANDROID__ && __ANDROID_API__ < 17
    // old android NDKs lack posix_memalign; memalign memory may be passed to free()
    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
#else
    // generic fallback: over-allocate, align manually, and stash the raw
    // malloc pointer one slot before the aligned address so fastFree can recover it
    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
    if (!udata)
        return 0;
    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
    adata[-1] = udata;
    return adata;
#endif
}
// Release a pointer previously returned by fastMalloc above.
// Safe to call with a null pointer. Each branch mirrors the matching
// allocation branch in fastMalloc and must stay in sync with it.
static NCNN_FORCEINLINE void fastFree(void* ptr)
{
if (ptr)
{
#if _MSC_VER
// pairs with _aligned_malloc
_aligned_free(ptr);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
// posix_memalign memory is released with plain free()
free(ptr);
#elif __ANDROID__ && __ANDROID_API__ < 17
// memalign memory is released with plain free()
free(ptr);
#else
// recover the raw malloc pointer stored one slot before the aligned address
unsigned char* udata = ((unsigned char**)ptr)[-1];
free(udata);
#endif
}
}
#if NCNN_THREADS
// exchange-add operation for atomic operations on reference counters
// NCNN_XADD(addr, delta): add delta to *addr and return the PREVIOUS value
// (fetch-and-add). Picks the best primitive available for the toolchain.
#if defined __riscv && !defined __riscv_atomic
// riscv target without A extension
// NOTE(review): plain read-modify-write, not atomic — presumably such
// targets are single-hart / single-threaded; confirm against build config
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
int tmp = *addr;
*addr += delta;
return tmp;
}
#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
// atomic increment on the linux version of the Intel(tm) compiler
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
#elif defined __GNUC__
#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
#ifdef __ATOMIC_ACQ_REL
#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
#else
// 4 is the numeric value of __ATOMIC_ACQ_REL on clang/gcc; used here
// because this branch compiles only when the macro itself is missing
#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
#endif
#else
#if defined __ATOMIC_ACQ_REL && !defined __clang__
// version for gcc >= 4.7
#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
#else
// legacy gcc fallback: __sync builtins imply a full barrier
#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
#endif
#endif
#elif defined _MSC_VER && !defined RC_INVOKED
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
// thread-unsafe branch
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
int tmp = *addr;
*addr += delta;
return tmp;
}
#endif
#else // NCNN_THREADS
// single-threaded build: no atomicity required
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
int tmp = *addr;
*addr += delta;
return tmp;
}
#endif // NCNN_THREADS
// Abstract interface for pluggable host-memory allocators.
// Implementations pair every fastMalloc with a fastFree on the same instance.
class NCNN_EXPORT Allocator
{
public:
virtual ~Allocator();
// allocate at least size bytes; ownership stays with this allocator
virtual void* fastMalloc(size_t size) = 0;
// release a pointer previously returned by this allocator's fastMalloc
virtual void fastFree(void* ptr) = 0;
};
class PoolAllocatorPrivate;
// Allocator that recycles freed buffers into a size-matched pool to avoid
// repeated system malloc/free. State lives in PoolAllocatorPrivate (pimpl).
// NOTE(review): presumably lock-guarded, in contrast to UnlockedPoolAllocator
// below — confirm in the implementation file.
class NCNN_EXPORT PoolAllocator : public Allocator
{
public:
PoolAllocator();
~PoolAllocator();
// how closely a pooled buffer's size must match a request to be reused
// ratio range 0 ~ 1
// default cr = 0
void set_size_compare_ratio(float scr);
// budget drop threshold
// default threshold = 10
void set_size_drop_threshold(size_t);
// release all budgets immediately
void clear();
virtual void* fastMalloc(size_t size);
virtual void fastFree(void* ptr);
private:
// non-copyable: declared but not defined
PoolAllocator(const PoolAllocator&);
PoolAllocator& operator=(const PoolAllocator&);
private:
PoolAllocatorPrivate* const d;
};
class UnlockedPoolAllocatorPrivate;
// Same buffer-recycling pool as PoolAllocator but, per its name, without
// locking — presumably for single-threaded use only; confirm in the
// implementation file before sharing one instance across threads.
class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
{
public:
UnlockedPoolAllocator();
~UnlockedPoolAllocator();
// how closely a pooled buffer's size must match a request to be reused
// ratio range 0 ~ 1
// default cr = 0
void set_size_compare_ratio(float scr);
// budget drop threshold
// default threshold = 10
void set_size_drop_threshold(size_t);
// release all budgets immediately
void clear();
virtual void* fastMalloc(size_t size);
virtual void fastFree(void* ptr);
private:
// non-copyable: declared but not defined
UnlockedPoolAllocator(const UnlockedPoolAllocator&);
UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
private:
UnlockedPoolAllocatorPrivate* const d;
};
#if NCNN_VULKAN
class VulkanDevice;
// Record describing one buffer allocation handed out by a VkAllocator.
// Plain data holder; lifetime is managed by the allocator that created it.
class NCNN_EXPORT VkBufferMemory
{
public:
VkBuffer buffer;
// the base offset assigned by allocator
size_t offset;
size_t capacity;
VkDeviceMemory memory;
// host-visible mapping base, if the memory is mappable
void* mapped_ptr;
// buffer state, modified by command functions internally
mutable VkAccessFlags access_flags;
mutable VkPipelineStageFlags stage_flags;
// initialize and modified by mat
int refcount;
};
// Record describing one image allocation handed out by a VkAllocator.
// Plain data holder; lifetime is managed by the allocator that created it.
class NCNN_EXPORT VkImageMemory
{
public:
VkImage image;
VkImageView imageview;
// underlying info assigned by allocator
int width;
int height;
int depth;
VkFormat format;
VkDeviceMemory memory;
// host-visible mapping base, if the memory is mappable
void* mapped_ptr;
// the base offset assigned by allocator
size_t bind_offset;
size_t bind_capacity;
// image state, modified by command functions internally
mutable VkAccessFlags access_flags;
mutable VkImageLayout image_layout;
mutable VkPipelineStageFlags stage_flags;
// in-execution state, modified by command functions internally
mutable int command_refcount;
// initialize and modified by mat
int refcount;
};
// Abstract base for Vulkan device-memory allocators.
// Subclasses allocate VkBufferMemory / VkImageMemory records on a specific
// VulkanDevice; vkdev is borrowed, not owned.
class NCNN_EXPORT VkAllocator
{
public:
explicit VkAllocator(const VulkanDevice* _vkdev);
virtual ~VkAllocator();
// release all cached allocations (subclasses override)
virtual void clear();
virtual VkBufferMemory* fastMalloc(size_t size) = 0;
virtual void fastFree(VkBufferMemory* ptr) = 0;
// flush/invalidate host-visible memory ranges; return 0 on success
virtual int flush(VkBufferMemory* ptr);
virtual int invalidate(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0;
virtual void fastFree(VkImageMemory* ptr) = 0;
public:
const VulkanDevice* vkdev;
// memory type indices chosen for buffer / image allocations
uint32_t buffer_memory_type_index;
uint32_t image_memory_type_index;
uint32_t reserved_type_index;
// whether the chosen memory type is host-mappable / host-coherent
bool mappable;
bool coherent;
protected:
// helpers for subclasses wrapping vkCreate* / vkAllocateMemory
VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
VkImageView create_imageview(VkImage image, VkFormat format);
};
class VkBlobAllocatorPrivate;
// Device allocator for intermediate blob memory, sub-allocating from
// preferred_block_size chunks (default 16M). Pimpl'd via VkBlobAllocatorPrivate.
class NCNN_EXPORT VkBlobAllocator : public VkAllocator
{
public:
explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
virtual ~VkBlobAllocator();
public:
// release all budgets immediately
virtual void clear();
virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);
private:
// non-copyable: declared but not defined
VkBlobAllocator(const VkBlobAllocator&);
VkBlobAllocator& operator=(const VkBlobAllocator&);
private:
VkBlobAllocatorPrivate* const d;
};
class VkWeightAllocatorPrivate;
// Device allocator for long-lived model weight memory, sub-allocating from
// preferred_block_size chunks (default 8M). Pimpl'd via VkWeightAllocatorPrivate.
class NCNN_EXPORT VkWeightAllocator : public VkAllocator
{
public:
explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
virtual ~VkWeightAllocator();
public:
// release all blocks immediately
virtual void clear();
public:
virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);
private:
// non-copyable: declared but not defined
VkWeightAllocator(const VkWeightAllocator&);
VkWeightAllocator& operator=(const VkWeightAllocator&);
private:
VkWeightAllocatorPrivate* const d;
};
class VkStagingAllocatorPrivate;
// Allocator for host-visible staging memory used in upload/download transfers.
// Recycles freed buffers; pimpl'd via VkStagingAllocatorPrivate.
class NCNN_EXPORT VkStagingAllocator : public VkAllocator
{
public:
explicit VkStagingAllocator(const VulkanDevice* vkdev);
virtual ~VkStagingAllocator();
public:
// how closely a pooled buffer's size must match a request to be reused
// ratio range 0 ~ 1
// default cr = 0.75
void set_size_compare_ratio(float scr);
// release all budgets immediately
virtual void clear();
virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);
private:
// non-copyable: declared but not defined
VkStagingAllocator(const VkStagingAllocator&);
VkStagingAllocator& operator=(const VkStagingAllocator&);
private:
VkStagingAllocatorPrivate* const d;
};
class VkWeightStagingAllocatorPrivate;
// Staging allocator variant for one-shot weight uploads.
// Pimpl'd via VkWeightStagingAllocatorPrivate.
class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
{
public:
explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
virtual ~VkWeightStagingAllocator();
public:
virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);
private:
// non-copyable: declared but not defined
VkWeightStagingAllocator(const VkWeightStagingAllocator&);
VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
private:
VkWeightStagingAllocatorPrivate* const d;
};
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
// Allocator that wraps an existing AHardwareBuffer as a Vulkan image
// (Android API >= 26). The AHardwareBuffer is borrowed from the caller.
class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
{
public:
VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
virtual ~VkAndroidHardwareBufferImageAllocator();
public:
virtual VkBufferMemory* fastMalloc(size_t size);
virtual void fastFree(VkBufferMemory* ptr);
virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
virtual void fastFree(VkImageMemory* ptr);
private:
// non-copyable: declared but not defined
VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
public:
// query the wrapped buffer's properties; call init() before the accessors
int init();
int width() const;
int height() const;
uint64_t external_format() const;
public:
AHardwareBuffer* hb;
AHardwareBuffer_Desc bufferDesc;
VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
};
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
#endif // NCNN_VULKAN
} // namespace ncnn
#endif // NCNN_ALLOCATOR_H