diff options
Diffstat (limited to '')
-rw-r--r-- | external/db_drivers/liblmdb/mdb.c | 1657 |
1 files changed, 948 insertions, 709 deletions
diff --git a/external/db_drivers/liblmdb/mdb.c b/external/db_drivers/liblmdb/mdb.c index 8e2b1eca1..3b2745f95 100644 --- a/external/db_drivers/liblmdb/mdb.c +++ b/external/db_drivers/liblmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2016 Howard Chu, Symas Corp. + * Copyright 2011-2019 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -41,6 +41,7 @@ #ifdef _WIN32 #include <malloc.h> #include <windows.h> +#include <wchar.h> /* get wcscpy() */ /* We use native NT APIs to setup the memory map, so that we can * let the DB file grow incrementally instead of always preallocating @@ -51,7 +52,7 @@ * NTDLL.DLL at runtime, to avoid buildtime dependencies on any * NTDLL import libraries. */ -typedef NTSTATUS WINAPI (NtCreateSectionFunc) +typedef NTSTATUS (WINAPI NtCreateSectionFunc) (OUT PHANDLE sh, IN ACCESS_MASK acc, IN void * oa OPTIONAL, IN PLARGE_INTEGER ms OPTIONAL, @@ -64,7 +65,7 @@ typedef enum _SECTION_INHERIT { ViewUnmap = 2 } SECTION_INHERIT; -typedef NTSTATUS WINAPI (NtMapViewOfSectionFunc) +typedef NTSTATUS (WINAPI NtMapViewOfSectionFunc) (IN PHANDLE sh, IN HANDLE ph, IN OUT PVOID *addr, IN ULONG_PTR zbits, IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL, @@ -73,7 +74,7 @@ typedef NTSTATUS WINAPI (NtMapViewOfSectionFunc) static NtMapViewOfSectionFunc *NtMapViewOfSection; -typedef NTSTATUS WINAPI (NtCloseFunc)(HANDLE h); +typedef NTSTATUS (WINAPI NtCloseFunc)(HANDLE h); static NtCloseFunc *NtClose; @@ -147,6 +148,10 @@ typedef SSIZE_T ssize_t; /* Most platforms have posix_memalign, older may only have memalign */ #define HAVE_MEMALIGN 1 #include <malloc.h> +/* On Solaris, we need the POSIX sigwait function */ +#if defined (__sun) +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif #endif #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) @@ -154,7 +159,7 @@ typedef SSIZE_T ssize_t; #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ #endif -#if defined(__APPLE__) || defined (BSD) +#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) # if !(defined(MDB_USE_POSIX_MUTEX) || defined(MDB_USE_POSIX_SEM)) # define MDB_USE_SYSV_SEM 1 # endif @@ -165,6 +170,7 @@ typedef SSIZE_T ssize_t; #ifndef _WIN32 #include <pthread.h> +#include <signal.h> #ifdef MDB_USE_POSIX_SEM # define MDB_USE_HASH 1 #include <semaphore.h> @@ -234,7 +240,7 @@ union semun { #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) # error "Unknown or unsupported endianness (BYTE_ORDER)" -#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +#elif (-6 & 5) || CHAR_BIT!=8 || UINT_MAX!=0xffffffff || MDB_SIZE_MAX%UINT_MAX # error "Two's complement, reasonably sized integer types, please" #endif @@ -309,6 +315,10 @@ union semun { # define MDB_USE_ROBUST 0 # else # define MDB_USE_ROBUST 1 +# endif +#endif /* !MDB_USE_ROBUST */ + +#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) /* glibc < 2.12 only provided _np API */ # if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) @@ -316,10 +326,9 @@ union semun { # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) # endif -# endif -#endif /* MDB_USE_ROBUST */ +#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ -#if defined(MDB_OWNERDEAD) && MDB_USE_ROBUST +#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) #define MDB_ROBUST_SUPPORTED 1 #endif @@ -342,8 +351,10 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t; #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) #define pthread_cond_signal(x) SetEvent(*x) #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) -#define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) -#define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) +#define THREAD_CREATE(thr,start,arg) \ + (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) +#define THREAD_FINISH(thr) \ + (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) #define mdb_mutex_consistent(mutex) 0 @@ -359,12 +370,10 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t; #else #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 #endif -#define Z "I" #else #define THREAD_RET void * #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) #define THREAD_FINISH(thr) pthread_join(thr,NULL) -#define Z "z" /**< printf format modifier for size_t */ /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ #define MDB_PIDLOCK 1 @@ -418,15 +427,15 @@ mdb_sem_wait(mdb_mutexref_t sem) #define mdb_mutex_consistent(mutex) 0 #else /* MDB_USE_POSIX_MUTEX: */ - /** Shared mutex/semaphore as it is stored (mdb_mutex_t), and as - * local variables keep it (mdb_mutexref_t). + /** Shared mutex/semaphore as the original is stored. * - * An mdb_mutex_t can be assigned to an mdb_mutexref_t. They can - * be the same, or an array[size 1] and a pointer. - * @{ + * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. + * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it + * is array[size 1] so it can be assigned to the pointer. */ -typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; - /* @} */ +typedef pthread_mutex_t mdb_mutex_t[1]; + /** Reference to an #mdb_mutex_t */ +typedef pthread_mutex_t *mdb_mutexref_t; /** Lock the reader or writer mutex. * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). */ @@ -462,28 +471,23 @@ typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) #endif -#ifdef MDB_VL32 -#ifdef _WIN32 -#define Y "I64" -#else -#define Y "ll" -#endif -#else -#define Y Z -#endif +#define Z MDB_FMT_Z /**< printf/scanf format modifier for size_t */ +#define Yu MDB_PRIy(u) /**< printf format for #mdb_size_t */ +#define Yd MDB_PRIy(d) /**< printf format for 'signed #mdb_size_t' */ -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) -#define MNAME_LEN 32 -#elif defined(MDB_USE_SYSV_SEM) +#ifdef MDB_USE_SYSV_SEM #define MNAME_LEN (sizeof(int)) #else #define MNAME_LEN (sizeof(pthread_mutex_t)) #endif -#ifdef MDB_USE_SYSV_SEM -#define SYSV_SEM_FLAG 1 /**< SysV sems in lockfile format */ -#else -#define SYSV_SEM_FLAG 0 +/** Initial part of #MDB_env.me_mutexname[]. + * Changes to this code must be reflected in #MDB_LOCK_FORMAT. + */ +#ifdef _WIN32 +#define MUTEXNAME_PREFIX "Global\\MDB" +#elif defined MDB_USE_POSIX_SEM +#define MUTEXNAME_PREFIX "/MDB" #endif /** @} */ @@ -627,7 +631,11 @@ static txnid_t mdb_debug_start; /** The version number for a database's datafile format. */ #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) /** The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) +#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 2) + /** Number of bits representing #MDB_LOCK_VERSION in #MDB_LOCK_FORMAT. + * The remaining bits must leave room for #MDB_lock_desc. + */ +#define MDB_LOCK_VERSION_BITS 12 /** @brief The max size of a key we can write, or 0 for computed max. * @@ -693,12 +701,27 @@ static txnid_t mdb_debug_start; /** Round \b n up to an even number. */ #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + /** Least significant 1-bit of \b n. n must be of an unsigned type. */ +#define LOW_BIT(n) ((n) & (-(n))) + + /** (log2(\b p2) % \b n), for p2 = power of 2 and 0 < n < 8. */ +#define LOG2_MOD(p2, n) (7 - 86 / ((p2) % ((1U<<(n))-1) + 11)) + /* Explanation: Let p2 = 2**(n*y + x), x<n and M = (1U<<n)-1. Now p2 = + * (M+1)**y * 2**x = 2**x (mod M). Finally "/" "happens" to return 7-x. + */ + + /** Should be alignment of \b type. Ensure it is a power of 2. */ +#define ALIGNOF2(type) \ + LOW_BIT(offsetof(struct { char ch_; type align_; }, align_)) + /** Used for offsets within a single page. * Since memory pages are typically 4 or 8KB in size, 12-13 bits, * this is plenty. */ typedef uint16_t indx_t; +typedef unsigned long long mdb_hash_t; + /** Default size of memory map. * This is certainly too small for any actual applications. Apps should always set * the size explicitly using #mdb_env_set_mapsize(). @@ -827,7 +850,8 @@ typedef struct MDB_txbody { */ volatile unsigned mtb_numreaders; #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) - char mtb_rmname[MNAME_LEN]; + /** Binary form of names of the reader/writer locks */ + mdb_hash_t mtb_mutexid; #elif defined(MDB_USE_SYSV_SEM) int mtb_semid; int mtb_rlocked; @@ -846,20 +870,18 @@ typedef struct MDB_txninfo { #define mti_magic mt1.mtb.mtb_magic #define mti_format mt1.mtb.mtb_format #define mti_rmutex mt1.mtb.mtb_rmutex -#define mti_rmname mt1.mtb.mtb_rmname #define mti_txnid mt1.mtb.mtb_txnid #define mti_numreaders mt1.mtb.mtb_numreaders +#define mti_mutexid mt1.mtb.mtb_mutexid #ifdef MDB_USE_SYSV_SEM #define mti_semid mt1.mtb.mtb_semid #define mti_rlocked mt1.mtb.mtb_rlocked #endif char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; } mt1; +#if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) union { -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) - char mt2_wmname[MNAME_LEN]; -#define mti_wmname mt2.mt2_wmname -#elif defined MDB_USE_SYSV_SEM +#ifdef MDB_USE_SYSV_SEM int mt2_wlocked; #define mti_wlocked mt2.mt2_wlocked #else @@ -868,21 +890,79 @@ typedef struct MDB_txninfo { #endif char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; } mt2; +#endif MDB_reader mti_readers[1]; } MDB_txninfo; /** Lockfile format signature: version, features and field layout */ #define MDB_LOCK_FORMAT \ - ((uint32_t) \ - ((MDB_LOCK_VERSION) \ - /* Flags which describe functionality */ \ - + (SYSV_SEM_FLAG << 18) \ - + (((MDB_PIDLOCK) != 0) << 16))) + ((uint32_t) \ + (((MDB_LOCK_VERSION) % (1U << MDB_LOCK_VERSION_BITS)) \ + + MDB_lock_desc * (1U << MDB_LOCK_VERSION_BITS))) + + /** Lock type and layout. Values 0-119. _WIN32 implies #MDB_PIDLOCK. + * Some low values are reserved for future tweaks. + */ +#ifdef _WIN32 +# define MDB_LOCK_TYPE (0 + ALIGNOF2(mdb_hash_t)/8 % 2) +#elif defined MDB_USE_POSIX_SEM +# define MDB_LOCK_TYPE (4 + ALIGNOF2(mdb_hash_t)/8 % 2) +#elif defined MDB_USE_SYSV_SEM +# define MDB_LOCK_TYPE (8) +#elif defined MDB_USE_POSIX_MUTEX +/* We do not know the inside of a POSIX mutex and how to check if mutexes + * used by two executables are compatible. Just check alignment and size. + */ +# define MDB_LOCK_TYPE (10 + \ + LOG2_MOD(ALIGNOF2(pthread_mutex_t), 5) + \ + sizeof(pthread_mutex_t) / 4U % 22 * 5) +#endif + +enum { + /** Magic number for lockfile layout and features. + * + * This *attempts* to stop liblmdb variants compiled with conflicting + * options from using the lockfile at the same time and thus breaking + * it. It describes locking types, and sizes and sometimes alignment + * of the various lockfile items. + * + * The detected ranges are mostly guesswork, or based simply on how + * big they could be without using more bits. So we can tweak them + * in good conscience when updating #MDB_LOCK_VERSION. + */ + MDB_lock_desc = + /* Default CACHELINE=64 vs. other values (have seen mention of 32-256) */ + (CACHELINE==64 ? 0 : 1 + LOG2_MOD(CACHELINE >> (CACHELINE>64), 5)) + + 6 * (sizeof(MDB_PID_T)/4 % 3) /* legacy(2) to word(4/8)? */ + + 18 * (sizeof(pthread_t)/4 % 5) /* can be struct{id, active data} */ + + 90 * (sizeof(MDB_txbody) / CACHELINE % 3) + + 270 * (MDB_LOCK_TYPE % 120) + /* The above is < 270*120 < 2**15 */ + + ((sizeof(txnid_t) == 8) << 15) /* 32bit/64bit */ + + ((sizeof(MDB_reader) > CACHELINE) << 16) + /* Not really needed - implied by MDB_LOCK_TYPE != (_WIN32 locking) */ + + (((MDB_PIDLOCK) != 0) << 17) + /* 18 bits total: Must be <= (32 - MDB_LOCK_VERSION_BITS). */ +}; /** @} */ -/** Common header for all page types. - * Overflow records occupy a number of contiguous pages with no - * headers on any page after the first. +/** Common header for all page types. The page type depends on #mp_flags. + * + * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with + * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages + * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. + * + * #P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of #F_BIGDATA nodes. + * + * #P_SUBP sub-pages are small leaf "pages" with duplicate data. + * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. */ typedef struct MDB_page { #define mp_pgno mp_p.p_pgno @@ -891,7 +971,7 @@ typedef struct MDB_page { pgno_t p_pgno; /**< page number */ struct MDB_page *p_next; /**< for in-memory list of freed pages */ } mp_p; - uint16_t mp_pad; + uint16_t mp_pad; /**< key size if this is a LEAF2 page */ /** @defgroup mdb_page Page Flags * @ingroup internal * Flags for the page headers. @@ -958,25 +1038,34 @@ typedef struct MDB_page { /** The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - /** Link in #MDB_txn.%mt_loose_pgs list */ + /** Link in #MDB_txn.%mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. + */ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) /** Header for a single key/data pair within a page. * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. * We guarantee 2-byte alignment for 'MDB_node's. + * + * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. #F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just #F_SUBDATA). */ typedef struct MDB_node { - /** lo and hi are used for data size on leaf nodes and for - * child pgno on branch nodes. On 64 bit platforms, flags - * is also used for pgno. (Branch nodes have no flags). - * They are in host byte order in case that lets some - * accesses be optimized into a 32-bit word access. - */ + /** part of data size or pgno + * @{ */ #if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ + unsigned short mn_lo, mn_hi; #else unsigned short mn_hi, mn_lo; #endif + /** @} */ /** @defgroup mdb_node Node Flags * @ingroup internal * Flags for node headers. @@ -1041,7 +1130,7 @@ typedef struct MDB_node { #ifdef MISALIGNED_OK #define COPY_PGNO(dst,src) dst = src #else -#if SIZE_MAX > 4294967295UL +#if MDB_SIZE_MAX > 0xffffffffU #define COPY_PGNO(dst,src) do { \ unsigned short *s, *d; \ s = (unsigned short *)&(src); \ @@ -1086,9 +1175,9 @@ typedef struct MDB_db { pgno_t md_root; /**< the root page of this tree */ } MDB_db; - /** mdb_dbi_open flags */ #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) + /** #mdb_dbi_open() flags */ #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) @@ -1121,13 +1210,16 @@ typedef struct MDB_meta { #else void *mm_address; /**< address for fixed mapping */ #endif - pgno_t mm_mapsize; /**< size of mmap region */ + mdb_size_t mm_mapsize; /**< size of mmap region */ MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ /** The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_pad /** Any persistent environment flags. @ref mdb_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags - pgno_t mm_last_pg; /**< last used page in file */ + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. + */ + pgno_t mm_last_pg; volatile txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; @@ -1180,7 +1272,7 @@ struct MDB_txn { * in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ MDB_page *mt_loose_pgs; - /* #Number of loose pages (#mt_loose_pgs) */ + /** Number of loose pages (#mt_loose_pgs) */ int mt_loose_count; /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are @@ -1203,11 +1295,12 @@ struct MDB_txn { * @ingroup internal * @{ */ -#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; @@ -1317,6 +1410,11 @@ struct MDB_cursor { indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ #ifdef MDB_VL32 MDB_page *mc_ovpg; /**< a referenced overflow page */ +# define MC_OVPG(mc) ((mc)->mc_ovpg) +# define MC_SET_OVPG(mc, pg) ((mc)->mc_ovpg = (pg)) +#else +# define MC_OVPG(mc) ((MDB_page *)0) +# define MC_SET_OVPG(mc, pg) ((void)0) #endif }; @@ -1336,6 +1434,23 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; + /** Check if there is an inited xcursor */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + + /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed + * when the node which contains the sub-page may have moved. Called + * with leaf page \b mp = mc->mc_pg[\b top]. + */ +#define XCURSOR_REFRESH(mc, top, mp) do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node; \ + if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ + xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ + if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ +} while (0) + /** State of FreeDB old pages, stored in the MDB_env */ typedef struct MDB_pgstate { pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ @@ -1346,7 +1461,7 @@ typedef struct MDB_pgstate { struct MDB_env { HANDLE me_fd; /**< The main data file */ HANDLE me_lfd; /**< The lock file */ - HANDLE me_mfd; /**< just for writing the meta pages */ + HANDLE me_mfd; /**< For writing and syncing the meta pages */ #if defined(MDB_VL32) && defined(_WIN32) HANDLE me_fmh; /**< File Mapping handle */ #endif @@ -1407,6 +1522,10 @@ struct MDB_env { #else mdb_mutex_t me_rmutex; mdb_mutex_t me_wmutex; +# if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + /** Half-initialized name of mutexes, to be completed by #MUTEXNAME() */ + char me_mutexname[sizeof(MUTEXNAME_PREFIX) + 11]; +# endif #endif #ifdef MDB_VL32 MDB_ID3L me_rpages; /**< like #mt_rpages, but global to env */ @@ -1522,19 +1641,25 @@ static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; /** @endcond */ -/** Compare two items pointing at size_t's of unknown alignment. */ +/** Compare two items pointing at '#mdb_size_t's of unknown alignment. */ #ifdef MISALIGNED_OK # define mdb_cmp_clong mdb_cmp_long #else # define mdb_cmp_clong mdb_cmp_cint #endif +/** True if we need #mdb_cmp_clong() instead of \b cmp for #MDB_INTEGERDUP */ +#define NEED_CMP_CLONG(cmp, ksize) \ + (UINT_MAX < MDB_SIZE_MAX && \ + (cmp) == mdb_cmp_int && (ksize) == sizeof(mdb_size_t)) + #ifdef _WIN32 static SECURITY_DESCRIPTOR mdb_null_sd; static SECURITY_ATTRIBUTES mdb_all_sa; static int mdb_sec_inited; -static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize); +struct MDB_name; +static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); #endif /** Return the library version info. */ @@ -1569,6 +1694,7 @@ static char *const mdb_errstr[] = { "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", + "MDB_PROBLEM: Unexpected problem - txn should abort", }; char * @@ -1641,11 +1767,7 @@ mdb_assert_fail(MDB_env *env, const char *expr_txt, if (env->me_assert_func) env->me_assert_func(env, buf); fprintf(stderr, "%s\n", buf); -#ifdef NDEBUG - _exit(); -#else abort(); -#endif } #else # define mdb_assert0(env, expr, expr_txt) ((void) 0) @@ -1717,20 +1839,20 @@ mdb_page_list(MDB_page *mp) case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; case P_OVERFLOW: - fprintf(stderr, "Overflow page %"Y"u pages %u%s\n", + fprintf(stderr, "Overflow page %"Yu" pages %u%s\n", pgno, mp->mp_pages, state); return; case P_META: - fprintf(stderr, "Meta-page %"Y"u txnid %"Y"u\n", + fprintf(stderr, "Meta-page %"Yu" txnid %"Yu"\n", pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); return; default: - fprintf(stderr, "Bad page %"Y"u flags 0x%u\n", pgno, mp->mp_flags); + fprintf(stderr, "Bad page %"Yu" flags 0x%X\n", pgno, mp->mp_flags); return; } nkeys = NUMKEYS(mp); - fprintf(stderr, "%s %"Y"u numkeys %d%s\n", type, pgno, nkeys, state); + fprintf(stderr, "%s %"Yu" numkeys %d%s\n", type, pgno, nkeys, state); for (i=0; i<nkeys; i++) { if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ @@ -1745,7 +1867,7 @@ mdb_page_list(MDB_page *mp) key.mv_data = node->mn_data; nsize = NODESIZE + key.mv_size; if (IS_BRANCH(mp)) { - fprintf(stderr, "key %d: page %"Y"u, %s\n", i, NODEPGNO(node), + fprintf(stderr, "key %d: page %"Yu", %s\n", i, NODEPGNO(node), DKEY(&key)); total += nsize; } else { @@ -1780,7 +1902,7 @@ mdb_cursor_chk(MDB_cursor *mc) } if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) printf("ack!\n"); - if (mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + if (XCURSOR_INITED(mc)) { node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { @@ -1841,7 +1963,7 @@ static void mdb_audit(MDB_txn *txn) } } if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - fprintf(stderr, "audit: %"Y"u freecount: %"Y"u count: %"Y"u total: %"Y"u next_pgno: %"Y"u\n", + fprintf(stderr, "audit: %"Yu" freecount: %"Yu" count: %"Yu" total: %"Yu" next_pgno: %"Yu"\n", txn->mt_txnid, freecount, count+NUM_METAS, freecount+count+NUM_METAS, txn->mt_next_pgno); } @@ -1858,15 +1980,14 @@ int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; -#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) - if (dcmp == mdb_cmp_int && a->mv_size == sizeof(mdb_size_t)) + if (NEED_CMP_CLONG(dcmp, a->mv_size)) dcmp = mdb_cmp_clong; -#endif return dcmp(a, b); } /** Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. */ static MDB_page * mdb_page_malloc(MDB_txn *txn, unsigned num) @@ -1978,8 +2099,14 @@ mdb_cursor_unref(MDB_cursor *mc) mc->mc_pg[0] = NULL; mc->mc_flags &= ~C_INITIALIZED; } +#define MDB_CURSOR_UNREF(mc, force) \ + (((force) || ((mc)->mc_flags & C_INITIALIZED)) \ + ? mdb_cursor_unref(mc) \ + : (void)0) + #else #define MDB_PAGE_UNREF(txn, mp) +#define MDB_CURSOR_UNREF(mc, force) ((void)0) #endif /* MDB_VL32 */ /** Loosen or free a single page. @@ -2011,7 +2138,7 @@ mdb_page_loose(MDB_cursor *mc, MDB_page *mp) if (mp != dl[x].mptr) { /* bad cursor? */ mc->mc_flags &= ~(C_INITIALIZED|C_EOF); txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; + return MDB_PROBLEM; } /* ok, it's ours */ loose = 1; @@ -2023,8 +2150,7 @@ mdb_page_loose(MDB_cursor *mc, MDB_page *mp) } } if (loose) { - DPRINTF(("loosen db %d page %"Y"u", DDBI(mc), - mp->mp_pgno)); + DPRINTF(("loosen db %d page %"Yu, DDBI(mc), mp->mp_pgno)); NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; txn->mt_loose_pgs = mp; txn->mt_loose_count++; @@ -2057,13 +2183,9 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) unsigned i, j; int rc = MDB_SUCCESS, level; - /* Mark pages seen by cursors */ - if (mc->mc_flags & C_UNTRACK) - mc = NULL; /* will find mc in mt_cursors */ - for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { - for (; mc; mc=mc->mc_next) { - if (!(mc->mc_flags & C_INITIALIZED)) - continue; + /* Mark pages seen by cursors: First m0, then tracked cursors */ + for (i = txn->mt_numdbs;; ) { + if (mc->mc_flags & C_INITIALIZED) { for (m3 = mc;; m3 = &mx->mx_cursor) { mp = NULL; for (j=0; j<m3->mc_snum; j++) { @@ -2082,10 +2204,13 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) break; } } - if (i == 0) - break; + mc = mc->mc_next; + for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) + if (i == 0) + goto mark_done; } +mark_done: if (all) { /* Mark dirty root pages */ for (i=0; i<txn->mt_numdbs; i++) { @@ -2275,7 +2400,7 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) } /** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. @@ -2321,8 +2446,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) np = txn->mt_loose_pgs; txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); txn->mt_loose_count--; - DPRINTF(("db %d use loose page %"Y"u", DDBI(mc), - np->mp_pgno)); + DPRINTF(("db %d use loose page %"Yu, DDBI(mc), np->mp_pgno)); *mp = np; return MDB_SUCCESS; } @@ -2408,7 +2532,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) - return rc; + goto fail; idl = (MDB_ID *) data.mv_data; i = idl[0]; @@ -2424,10 +2548,10 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) } env->me_pglast = last; #if (MDB_DEBUG) > 1 - DPRINTF(("IDL read txn %"Y"u root %"Y"u num %u", + DPRINTF(("IDL read txn %"Yu" root %"Yu" num %u", last, txn->mt_dbs[FREE_DBI].md_root, i)); for (j = i; j; j--) - DPRINTF(("IDL %"Y"u", idl[j])); + DPRINTF(("IDL %"Yu, idl[j])); #endif /* Merge in descending sorted order */ mdb_midl_xmerge(mop, idl); @@ -2572,6 +2696,7 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) } /** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc cursor pointing to the page to be touched * @return 0 on success, non-zero on failure. */ @@ -2597,7 +2722,7 @@ mdb_page_touch(MDB_cursor *mc) (rc = mdb_page_alloc(mc, 1, &np))) goto fail; pgno = np->mp_pgno; - DPRINTF(("touched db %d page %"Y"u -> %"Y"u", DDBI(mc), + DPRINTF(("touched db %d page %"Yu" -> %"Yu, DDBI(mc), mp->mp_pgno, pgno)); mdb_cassert(mc, mp->mp_pgno != pgno); mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); @@ -2621,7 +2746,7 @@ mdb_page_touch(MDB_cursor *mc) if (mp != dl[x].mptr) { /* bad cursor? */ mc->mc_flags &= ~(C_INITIALIZED|C_EOF); txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; + return MDB_PROBLEM; } return 0; } @@ -2660,14 +2785,8 @@ done: if (m2 == mc) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - IS_LEAF(np) && - (m2->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - { - MDB_node *leaf = NODEPTR(np, m2->mc_ki[mc->mc_top]); - if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } + if (IS_LEAF(np)) + XCURSOR_REFRESH(m2, mc->mc_top, np); } } } @@ -2998,7 +3117,7 @@ mdb_txn_renew(MDB_txn *txn) rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { - DPRINTF(("renew txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", + DPRINTF(("renew txn %"Yu"%c %p on mdbenv %p, root page %"Yu, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); } @@ -3116,7 +3235,7 @@ renew: } else { txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ *ret = txn; - DPRINTF(("begin txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", + DPRINTF(("begin txn %"Yu"%c %p on mdbenv %p, root page %"Yu, txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); } @@ -3183,7 +3302,7 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) /* Export or close DBI handles opened in this txn */ mdb_dbis_update(txn, mode & MDB_END_UPDATE); - DPRINTF(("%s txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", + DPRINTF(("%s txn %"Yu"%c %p on mdbenv %p, root page %"Yu, names[mode & MDB_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); @@ -3325,10 +3444,41 @@ mdb_freelist_save(MDB_txn *txn) * we may be unable to return them to me_pghead. */ MDB_page *mp = txn->mt_loose_pgs; + MDB_ID2 *dl = txn->mt_u.dirty_list; + unsigned x; if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) return rc; - for (; mp; mp = NEXT_LOOSE_PAGE(mp)) + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + /* must also remove from dirty list */ + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + for (x=1; x<=dl[0].mid; x++) + if (dl[x].mid == mp->mp_pgno) + break; + mdb_tassert(txn, x <= dl[0].mid); + } else { + x = mdb_mid2l_search(dl, mp->mp_pgno); + mdb_tassert(txn, dl[x].mid == mp->mp_pgno); + } + dl[x].mptr = NULL; + mdb_dpage_free(env, mp); + } + { + /* squash freed slots out of the dirty list */ + unsigned y; + for (y=1; dl[y].mptr && y <= dl[0].mid; y++); + if (y <= dl[0].mid) { + for(x=y, y++;;) { + while (!dl[y].mptr && y <= dl[0].mid) y++; + if (y > dl[0].mid) break; + dl[x++] = dl[y++]; + } + dl[0].mid = x-1; + } else { + /* all slots freed */ + dl[0].mid = 0; + } + } txn->mt_loose_pgs = NULL; txn->mt_loose_count = 0; } @@ -3384,10 +3534,10 @@ mdb_freelist_save(MDB_txn *txn) #if (MDB_DEBUG) > 1 { unsigned int i = free_pgs[0]; - DPRINTF(("IDL write txn %"Y"u root %"Y"u num %u", + DPRINTF(("IDL write txn %"Yu" root %"Yu" num %u", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); for (; i; i--) - DPRINTF(("IDL %"Y"u", free_pgs[i])); + DPRINTF(("IDL %"Yu, free_pgs[i])); } #endif continue; @@ -3555,7 +3705,7 @@ mdb_page_flush(MDB_txn *txn, int keep) * the write offset, to at least save the overhead of a Seek * system call. */ - DPRINTF(("committing page %"Z"u", pgno)); + DPRINTF(("committing page %"Yu, pgno)); memset(&ov, 0, sizeof(ov)); ov.Offset = pos & 0xffffffff; ov.OffsetHigh = pos >> 16 >> 16; @@ -3606,7 +3756,7 @@ retry_seek: wpos = pos; wsize = 0; } - DPRINTF(("committing page %"Y"u", pgno)); + DPRINTF(("committing page %"Yu, pgno)); next_pos = pos + size; iov[n].iov_len = size; iov[n].iov_base = (char *)dp; @@ -3821,7 +3971,7 @@ mdb_txn_commit(MDB_txn *txn) !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) goto done; - DPRINTF(("committing txn %"Y"u %p on mdbenv %p, root page %"Y"u", + DPRINTF(("committing txn %"Yu" %p on mdbenv %p, root page %"Yu, txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); /* Update DB root pointers */ @@ -3929,7 +4079,7 @@ mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta) p = (MDB_page *)&pbuf; if (!F_ISSET(p->mp_flags, P_META)) { - DPRINTF(("page %"Y"u not a meta page", p->mp_pgno)); + DPRINTF(("page %"Yu" not a meta page", p->mp_pgno)); return MDB_INVALID; } @@ -4041,7 +4191,7 @@ mdb_env_write_meta(MDB_txn *txn) #endif toggle = txn->mt_txnid & 1; - DPRINTF(("writing meta page %d for root page %"Y"u", + DPRINTF(("writing meta page %d for root page %"Yu, toggle, txn->mt_dbs[MAIN_DBI].md_root)); env = txn->mt_env; @@ -4093,7 +4243,10 @@ mdb_env_write_meta(MDB_txn *txn) len = sizeof(MDB_meta) - off; off += (char *)mp - env->me_map; - /* Write to the SYNC fd */ + /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. + * (me_mfd goes to the same file as me_fd, but writing to it + * also syncs to disk. Avoids a separate fdatasync() call.) + */ mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; #ifdef _WIN32 { @@ -4382,6 +4535,189 @@ mdb_fsize(HANDLE fd, mdb_size_t *size) return MDB_SUCCESS; } + +#ifdef _WIN32 +typedef wchar_t mdb_nchar_t; +# define MDB_NAME(str) L##str +# define mdb_name_cpy wcscpy +#else +/** Character type for file names: char on Unix, wchar_t on Windows */ +typedef char mdb_nchar_t; +# define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ +# define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ +#endif + +/** Filename - string of #mdb_nchar_t[] */ +typedef struct MDB_name { + int mn_len; /**< Length */ + int mn_alloced; /**< True if #mn_val was malloced */ + mdb_nchar_t *mn_val; /**< Contents */ +} MDB_name; + +/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ +static const mdb_nchar_t *const mdb_suffixes[2][2] = { + { MDB_NAME("/data.mdb"), MDB_NAME("") }, + { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } +}; + +#define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ + +/** Set up filename + scratch area for filename suffix, for opening files. + * It should be freed with #mdb_fname_destroy(). + * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. + * + * @param[in] path Pathname for #mdb_env_open(). + * @param[in] envflags Whether a subdir and/or lockfile will be used. + * @param[out] fname Resulting filename, with room for a suffix if necessary. + */ +static int ESECT +mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) +{ + int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); + fname->mn_alloced = 0; +#ifdef _WIN32 + return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); +#else + fname->mn_len = strlen(path); + if (no_suffix) + fname->mn_val = (char *) path; + else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { + fname->mn_alloced = 1; + strcpy(fname->mn_val, path); + } + else + return ENOMEM; + return MDB_SUCCESS; +#endif +} + +/** Destroy \b fname from #mdb_fname_init() */ +#define mdb_fname_destroy(fname) \ + do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) + +#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ +# define MDB_CLOEXEC O_CLOEXEC +#else +# define MDB_CLOEXEC 0 +#endif + +/** File type, access mode etc. for #mdb_fopen() */ +enum mdb_fopen_type { +#ifdef _WIN32 + MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS +#else + /* A comment in mdb_fopen() explains some O_* flag choices. */ + MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ + MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ + MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ + MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ + /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits + * distinguish otherwise-equal MDB_O_* constants from each other. + */ + MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, + MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ +#endif +}; + +/** Open an LMDB file. + * @param[in] env The LMDB environment. + * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is + * appended if necessary to create the filename, without changing mn_len. + * @param[in] which Determines file type, access mode, etc. + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[out] res Resulting file handle. + * @return 0 on success, non-zero on failure. + */ +static int ESECT +mdb_fopen(const MDB_env *env, MDB_name *fname, + enum mdb_fopen_type which, mdb_mode_t mode, + HANDLE *res) +{ + int rc = MDB_SUCCESS; + HANDLE fd; +#ifdef _WIN32 + DWORD acc, share, disp, attrs; +#else + int flags; +#endif + + if (fname->mn_alloced) /* modifiable copy */ + mdb_name_cpy(fname->mn_val + fname->mn_len, + mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); + + /* The directory must already exist. Usually the file need not. + * MDB_O_META requires the file because we already created it using + * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. + * + * With MDB_O_COPY we do not want the OS to cache the writes, since + * the source data is already in the OS cache. + * + * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) + * to avoid the flock() issues noted under Caveats in lmdb.h. + * Also set it for other filehandles which the user cannot get at + * and close himself, which he may need after fork(). I.e. all but + * me_fd, which programs do use via mdb_env_get_fd(). + */ + +#ifdef _WIN32 + acc = GENERIC_READ|GENERIC_WRITE; + share = FILE_SHARE_READ|FILE_SHARE_WRITE; + disp = OPEN_ALWAYS; + attrs = FILE_ATTRIBUTE_NORMAL; + switch (which) { + case MDB_O_RDONLY: /* read-only datafile */ + acc = GENERIC_READ; + disp = OPEN_EXISTING; + break; + case MDB_O_META: /* for writing metapages */ + acc = GENERIC_WRITE; + disp = OPEN_EXISTING; + attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; + break; + case MDB_O_COPY: /* mdb_env_copy() & co */ + acc = GENERIC_WRITE; + share = 0; + disp = CREATE_NEW; + attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; + break; + default: break; /* silence gcc -Wswitch (not all enum values handled) */ + } + fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); +#else + fd = open(fname->mn_val, which & MDB_O_MASK, mode); +#endif + + if (fd == INVALID_HANDLE_VALUE) + rc = ErrCode(); +#ifndef _WIN32 + else { + if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { + /* Set CLOEXEC if we could not pass it to open() */ + if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) + (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); + } + if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { + /* This may require buffer alignment. There is no portable + * way to ask how much, so we require OS pagesize alignment. + */ +# ifdef F_NOCACHE /* __APPLE__ */ + (void) fcntl(fd, F_NOCACHE, 1); +# elif defined O_DIRECT + /* open(...O_DIRECT...) would break on filesystems without + * O_DIRECT support (ITS#7682). Try to set it here instead. + */ + if ((flags = fcntl(fd, F_GETFL)) != -1) + (void) fcntl(fd, F_SETFL, flags | O_DIRECT); +# endif + } + } +#endif /* !_WIN32 */ + + *res = fd; + return rc; +} + + #ifdef BROKEN_FDATASYNC #include <sys/utsname.h> #include <sys/vfs.h> @@ -4407,16 +4743,16 @@ mdb_env_open2(MDB_env *env, int prev) if (!NtCreateSection) { HMODULE h = GetModuleHandle("NTDLL.DLL"); if (!h) - return MDB_PANIC; + return MDB_PROBLEM; NtClose = (NtCloseFunc *)GetProcAddress(h, "NtClose"); if (!NtClose) - return MDB_PANIC; + return MDB_PROBLEM; NtMapViewOfSection = (NtMapViewOfSectionFunc *)GetProcAddress(h, "NtMapViewOfSection"); if (!NtMapViewOfSection) - return MDB_PANIC; + return MDB_PROBLEM; NtCreateSection = (NtCreateSectionFunc *)GetProcAddress(h, "NtCreateSection"); if (!NtCreateSection) - return MDB_PANIC; + return MDB_PROBLEM; } #endif /* _WIN32 */ @@ -4554,13 +4890,13 @@ mdb_env_open2(MDB_env *env, int prev) DPRINTF(("opened database version %u, pagesize %u", meta->mm_version, env->me_psize)); - DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); - DPRINTF(("depth: %u", db->md_depth)); - DPRINTF(("entries: %"Y"u", db->md_entries)); - DPRINTF(("branch pages: %"Y"u", db->md_branch_pages)); - DPRINTF(("leaf pages: %"Y"u", db->md_leaf_pages)); - DPRINTF(("overflow pages: %"Y"u", db->md_overflow_pages)); - DPRINTF(("root: %"Y"u", db->md_root)); + DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); + DPRINTF(("depth: %u", db->md_depth)); + DPRINTF(("entries: %"Yu, db->md_entries)); + DPRINTF(("branch pages: %"Yu, db->md_branch_pages)); + DPRINTF(("leaf pages: %"Yu, db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Yu, db->md_overflow_pages)); + DPRINTF(("root: %"Yu, db->md_root)); } #endif @@ -4577,7 +4913,11 @@ mdb_env_reader_dest(void *ptr) { MDB_reader *reader = ptr; - reader->mr_pid = 0; +#ifndef _WIN32 + if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ +#endif + /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ + reader->mr_pid = 0; } #ifdef _WIN32 @@ -4749,32 +5089,21 @@ mdb_env_excl_lock(MDB_env *env, int *excl) * Share and Enjoy! :-) */ -typedef unsigned long long mdb_hash_t; -#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) - /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer * @param[in] val value to hash - * @param[in] hval initial value for hash + * @param[in] len length of value * @return 64 bit hash - * - * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the - * hval arg on the first call. */ static mdb_hash_t -mdb_hash_val(MDB_val *val, mdb_hash_t hval) +mdb_hash(const void *val, size_t len) { - unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ - unsigned char *end = s + val->mv_size; + const unsigned char *s = (const unsigned char *) val, *end = s + len; + mdb_hash_t hval = 0xcbf29ce484222325ULL; /* - * FNV-1a hash each octet of the string + * FNV-1a hash each octet of the buffer */ while (s < end) { - /* xor the bottom with the current octet */ - hval ^= (mdb_hash_t)*s++; - - /* multiply by the 64 bit FNV magic prime mod 2^64 */ - hval += (hval << 1) + (hval << 4) + (hval << 5) + - (hval << 7) + (hval << 8) + (hval << 40); + hval = (hval ^ *s++) * 0x100000001b3ULL; } /* return our new hash value */ return hval; @@ -4791,47 +5120,49 @@ mdb_hash_val(MDB_val *val, mdb_hash_t hval) static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; static void ESECT -mdb_pack85(unsigned long l, char *out) +mdb_pack85(unsigned long long l, char *out) { int i; - for (i=0; i<5; i++) { + for (i=0; i<10 && l; i++) { *out++ = mdb_a85[l % 85]; l /= 85; } + *out = '\0'; } +/** Init #MDB_env.me_mutexname[] except the char which #MUTEXNAME() will set. + * Changes to this code must be reflected in #MDB_LOCK_FORMAT. + */ static void ESECT -mdb_hash_enc(MDB_val *val, char *encbuf) +mdb_env_mname_init(MDB_env *env) { - mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); - - mdb_pack85(h, encbuf); - mdb_pack85(h>>32, encbuf+5); - encbuf[10] = '\0'; + char *nm = env->me_mutexname; + strcpy(nm, MUTEXNAME_PREFIX); + mdb_pack85(env->me_txns->mti_mutexid, nm + sizeof(MUTEXNAME_PREFIX)); } + +/** Return env->me_mutexname after filling in ch ('r'/'w') for convenience */ +#define MUTEXNAME(env, ch) ( \ + (void) ((env)->me_mutexname[sizeof(MUTEXNAME_PREFIX)-1] = (ch)), \ + (env)->me_mutexname) + #endif /** Open and/or initialize the lock region for the environment. * @param[in] env The LMDB environment. - * @param[in] lpath The pathname of the file used for the lock region. + * @param[in] fname Filename + scratch area, from #mdb_fname_init(). * @param[in] mode The Unix permissions for the file, if we create it. * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive * @return 0 on success, non-zero on failure. */ static int ESECT -mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) +mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) { #ifdef _WIN32 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT #else # define MDB_ERRCODE_ROFS EROFS -#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ -# define MDB_CLOEXEC O_CLOEXEC -#else - int fdflags; -# define MDB_CLOEXEC 0 -#endif #endif #ifdef MDB_USE_SYSV_SEM int semid; @@ -4840,30 +5171,14 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) int rc; off_t size, rsize; -#ifdef _WIN32 - wchar_t *wlpath; - rc = utf8_to_utf16(lpath, -1, &wlpath, NULL); - if (rc) - return rc; - env->me_lfd = CreateFileW(wlpath, GENERIC_READ|GENERIC_WRITE, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); - free(wlpath); -#else - env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); -#endif - if (env->me_lfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); + rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); + if (rc) { + /* Omit lockfile if read-only env on read-only filesystem */ if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { return MDB_SUCCESS; } - goto fail_errno; + goto fail; } -#if ! ((MDB_CLOEXEC) || defined(_WIN32)) - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_lfd, F_SETFD, fdflags); -#endif if (!(env->me_flags & MDB_NOTLS)) { rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); @@ -4929,8 +5244,6 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) DWORD nhigh; DWORD nlow; } idbuf; - MDB_val val; - char encbuf[11]; if (!mdb_sec_inited) { InitializeSecurityDescriptor(&mdb_null_sd, @@ -4945,14 +5258,11 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) idbuf.volume = stbuf.dwVolumeSerialNumber; idbuf.nhigh = stbuf.nFileIndexHigh; idbuf.nlow = stbuf.nFileIndexLow; - val.mv_data = &idbuf; - val.mv_size = sizeof(idbuf); - mdb_hash_enc(&val, encbuf); - sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); - sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); - env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); + env->me_txns->mti_mutexid = mdb_hash(&idbuf, sizeof(idbuf)); + mdb_env_mname_init(env); + env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, MUTEXNAME(env, 'r')); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, MUTEXNAME(env, 'w')); if (!env->me_wmutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) struct stat stbuf; @@ -4960,37 +5270,35 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) dev_t dev; ino_t ino; } idbuf; - MDB_val val; - char encbuf[11]; #if defined(__NetBSD__) #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ #endif if (fstat(env->me_lfd, &stbuf)) goto fail_errno; + memset(&idbuf, 0, sizeof(idbuf)); idbuf.dev = stbuf.st_dev; idbuf.ino = stbuf.st_ino; - val.mv_data = &idbuf; - val.mv_size = sizeof(idbuf); - mdb_hash_enc(&val, encbuf); + env->me_txns->mti_mutexid = mdb_hash(&idbuf, sizeof(idbuf)) #ifdef MDB_SHORT_SEMNAMES - encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ + /* Max 9 base85-digits. We truncate here instead of in + * mdb_env_mname_init() to keep the latter portable. + */ + % ((mdb_hash_t)85*85*85*85*85*85*85*85*85) #endif - sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); - sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); + ; + mdb_env_mname_init(env); /* Clean up after a previous run, if needed: Try to * remove both semaphores before doing anything else. */ - sem_unlink(env->me_txns->mti_rmname); - sem_unlink(env->me_txns->mti_wmname); - env->me_rmutex = sem_open(env->me_txns->mti_rmname, - O_CREAT|O_EXCL, mode, 1); + sem_unlink(MUTEXNAME(env, 'r')); + sem_unlink(MUTEXNAME(env, 'w')); + env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), O_CREAT|O_EXCL, mode, 1); if (env->me_rmutex == SEM_FAILED) goto fail_errno; - env->me_wmutex = sem_open(env->me_txns->mti_wmname, - O_CREAT|O_EXCL, mode, 1); + env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), O_CREAT|O_EXCL, mode, 1); if (env->me_wmutex == SEM_FAILED) goto fail_errno; #elif defined(MDB_USE_SYSV_SEM) unsigned short vals[2] = {1, 1}; - key_t key = ftok(lpath, 'M'); + key_t key = ftok(fname->mn_val, 'M'); /* fname is lockfile path now */ if (key == -1) goto fail_errno; semid = semget(key, 2, (mode & 0777) | IPC_CREAT); @@ -5000,6 +5308,8 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) if (semctl(semid, 0, SETALL, semu) < 0) goto fail_errno; env->me_txns->mti_semid = semid; + env->me_txns->mti_rlocked = 0; + env->me_txns->mti_wlocked = 0; #else /* MDB_USE_POSIX_MUTEX: */ pthread_mutexattr_t mattr; @@ -5010,15 +5320,17 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); - if ((rc = pthread_mutexattr_init(&mattr)) - || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) + if ((rc = pthread_mutexattr_init(&mattr)) != 0) + goto fail; + rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); #ifdef MDB_ROBUST_SUPPORTED - || (rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST)) + if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); #endif - || (rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr)) - || (rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr))) - goto fail; + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); + if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); pthread_mutexattr_destroy(&mattr); + if (rc) + goto fail; #endif /* _WIN32 || ... */ env->me_txns->mti_magic = MDB_MAGIC; @@ -5046,14 +5358,16 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) goto fail; } #ifdef _WIN32 - env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); + mdb_env_mname_init(env); + env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'r')); if (!env->me_rmutex) goto fail_errno; - env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); + env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'w')); if (!env->me_wmutex) goto fail_errno; #elif defined(MDB_USE_POSIX_SEM) - env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); + mdb_env_mname_init(env); + env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), 0); if (env->me_rmutex == SEM_FAILED) goto fail_errno; - env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); + env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), 0); if (env->me_wmutex == SEM_FAILED) goto fail_errno; #elif defined(MDB_USE_SYSV_SEM) semid = env->me_txns->mti_semid; @@ -5074,13 +5388,6 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) env->me_rmutex->locked = &env->me_txns->mti_rlocked; env->me_wmutex->locked = &env->me_txns->mti_wlocked; #endif -#ifdef MDB_VL32 -#ifdef _WIN32 - env->me_rpmutex = CreateMutex(NULL, FALSE, NULL); -#else - pthread_mutex_init(&env->me_rpmutex, NULL); -#endif -#endif return MDB_SUCCESS; @@ -5090,12 +5397,6 @@ fail: return rc; } - /** The name of the lock file in the DB environment */ -#define LOCKNAME "/lock.mdb" - /** The name of the data file in the DB environment */ -#define DATANAME "/data.mdb" - /** The suffix of the lock file when no subdir is used */ -#define LOCKSUFF "-lock" /** Only a subset of the @ref mdb_env flags can be changed * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. @@ -5111,11 +5412,8 @@ fail: int ESECT mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) { - int oflags, rc, len, excl = -1; - char *lpath, *dpath; -#ifdef _WIN32 - wchar_t *wpath; -#endif + int rc, excl = -1; + MDB_name fname; if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; @@ -5130,28 +5428,27 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode return EINVAL; } #endif + flags |= env->me_flags; - len = strlen(path); - if (flags & MDB_NOSUBDIR) { - rc = len + sizeof(LOCKSUFF) + len + 1; - } else { - rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); - } - lpath = malloc(rc); - if (!lpath) - return ENOMEM; - if (flags & MDB_NOSUBDIR) { - dpath = lpath + len + sizeof(LOCKSUFF); - sprintf(lpath, "%s" LOCKSUFF, path); - strcpy(dpath, path); - } else { - dpath = lpath + len + sizeof(LOCKNAME); - sprintf(lpath, "%s" LOCKNAME, path); - sprintf(dpath, "%s" DATANAME, path); + rc = mdb_fname_init(path, flags, &fname); + if (rc) + return rc; + +#ifdef MDB_VL32 +#ifdef _WIN32 + env->me_rpmutex = CreateMutex(NULL, FALSE, NULL); + if (!env->me_rpmutex) { + rc = ErrCode(); + goto leave; } +#else + rc = pthread_mutex_init(&env->me_rpmutex, NULL); + if (rc) + goto leave; +#endif +#endif + flags |= MDB_ENV_ACTIVE; /* tell mdb_env_close0() to clean up */ - rc = MDB_SUCCESS; - flags |= env->me_flags; if (flags & MDB_RDONLY) { /* silently ignore WRITEMAP when we're only getting read access */ flags &= ~MDB_WRITEMAP; @@ -5160,8 +5457,13 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) rc = ENOMEM; } + + env->me_flags = flags; + if (rc) + goto leave; + #ifdef MDB_VL32 - if (!rc) { + { env->me_rpages = malloc(MDB_ERPAGE_SIZE * sizeof(MDB_ID3)); if (!env->me_rpages) { rc = ENOMEM; @@ -5171,9 +5473,6 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode env->me_rpcheck = MDB_ERPAGE_SIZE/2; } #endif - env->me_flags = flags |= MDB_ENV_ACTIVE; - if (rc) - goto leave; env->me_path = strdup(path); env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); @@ -5187,73 +5486,35 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode /* For RDONLY, get lockfile after we know datafile exists */ if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); + rc = mdb_env_setup_locks(env, &fname, mode, &excl); if (rc) goto leave; + if ((flags & MDB_PREVSNAPSHOT) && !excl) { + rc = EAGAIN; + goto leave; + } } -#ifdef _WIN32 - if (F_ISSET(flags, MDB_RDONLY)) { - oflags = GENERIC_READ; - len = OPEN_EXISTING; - } else { - oflags = GENERIC_READ|GENERIC_WRITE; - len = OPEN_ALWAYS; - } - mode = FILE_ATTRIBUTE_NORMAL; - rc = utf8_to_utf16(dpath, -1, &wpath, NULL); + rc = mdb_fopen(env, &fname, + (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, + mode, &env->me_fd); if (rc) goto leave; - env->me_fd = CreateFileW(wpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, - NULL, len, mode, NULL); - free(wpath); -#else - if (F_ISSET(flags, MDB_RDONLY)) - oflags = O_RDONLY; - else - oflags = O_RDWR | O_CREAT; - - env->me_fd = open(dpath, oflags, mode); -#endif - if (env->me_fd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); + rc = mdb_env_setup_locks(env, &fname, mode, &excl); if (rc) goto leave; - if ((flags & MDB_PREVSNAPSHOT) && !excl) { - rc = EAGAIN; - goto leave; - } } if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { - if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { - env->me_mfd = env->me_fd; - } else { + if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { /* Synchronous fd for meta writes. Needed even with * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. */ -#ifdef _WIN32 - len = OPEN_EXISTING; - rc = utf8_to_utf16(dpath, -1, &wpath, NULL); + rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); if (rc) goto leave; - env->me_mfd = CreateFileW(wpath, oflags, - FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, - mode | FILE_FLAG_WRITE_THROUGH, NULL); - free(wpath); -#else - oflags &= ~O_CREAT; - env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); -#endif - if (env->me_mfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } } DPRINTF(("opened dbenv %p", (void *) env)); if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { @@ -5296,7 +5557,7 @@ leave: if (rc) { mdb_env_close0(env, excl); } - free(lpath); + mdb_fname_destroy(fname); return rc; } @@ -5324,11 +5585,13 @@ mdb_env_close0(MDB_env *env, int excl) #ifdef MDB_VL32 if (env->me_txn0 && env->me_txn0->mt_rpages) free(env->me_txn0->mt_rpages); - { unsigned int x; - for (x=1; x<=env->me_rpages[0].mid; x++) - munmap(env->me_rpages[x].mptr, env->me_rpages[x].mcnt * env->me_psize); + if (env->me_rpages) { + MDB_ID3L el = env->me_rpages; + unsigned int x; + for (x=1; x<=el[0].mid; x++) + munmap(el[x].mptr, el[x].mcnt * env->me_psize); + free(el); } - free(env->me_rpages); #endif free(env->me_txn0); mdb_midl_free(env->me_free_pgs); @@ -5353,12 +5616,12 @@ mdb_env_close0(MDB_env *env, int excl) munmap(env->me_map, env->me_mapsize); #endif } - if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) + if (env->me_mfd != INVALID_HANDLE_VALUE) (void) close(env->me_mfd); if (env->me_fd != INVALID_HANDLE_VALUE) (void) close(env->me_fd); if (env->me_txns) { - MDB_PID_T pid = env->me_pid; + MDB_PID_T pid = getpid(); /* Clearing readers is done in this function because * me_txkey with its destructor must be disabled first. * @@ -5388,8 +5651,8 @@ mdb_env_close0(MDB_env *env, int excl) if (excl == 0) mdb_env_excl_lock(env, &excl); if (excl > 0) { - sem_unlink(env->me_txns->mti_rmname); - sem_unlink(env->me_txns->mti_wmname); + sem_unlink(MUTEXNAME(env, 'r')); + sem_unlink(MUTEXNAME(env, 'w')); } } #elif defined(MDB_USE_SYSV_SEM) @@ -5447,7 +5710,7 @@ mdb_env_close(MDB_env *env) free(env); } -/** Compare two items pointing at aligned mdb_size_t's */ +/** Compare two items pointing at aligned #mdb_size_t's */ static int mdb_cmp_long(const MDB_val *a, const MDB_val *b) { @@ -5458,7 +5721,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b) /** Compare two items pointing at aligned unsigned int's. * * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, - * but #mdb_cmp_clong() is called instead if the data type is mdb_size_t. + * but #mdb_cmp_clong() is called instead if the data type is #mdb_size_t. */ static int mdb_cmp_int(const MDB_val *a, const MDB_val *b) @@ -5563,7 +5826,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) nkeys = NUMKEYS(mp); - DPRINTF(("searching %u keys in %s %spage %"Y"u", + DPRINTF(("searching %u keys in %s %spage %"Yu, nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdb_dbg_pgno(mp))); @@ -5611,7 +5874,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) DPRINTF(("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc)); else - DPRINTF(("found branch index %u [%s -> %"Y"u], rc = %i", + DPRINTF(("found branch index %u [%s -> %"Yu"], rc = %i", i, DKEY(&nodekey), NODEPGNO(node), rc)); #endif if (rc == 0) @@ -5659,7 +5922,7 @@ static void mdb_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { - DPRINTF(("popping page %"Y"u off db %d cursor %p", + DPRINTF(("popping page %"Yu" off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); mc->mc_snum--; @@ -5671,11 +5934,13 @@ mdb_cursor_pop(MDB_cursor *mc) } } -/** Push a page onto the top of the cursor's stack. */ +/** Push a page onto the top of the cursor's stack. + * Set #MDB_TXN_ERROR on failure. + */ static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { - DPRINTF(("pushing page %"Y"u on db %d cursor %p", mp->mp_pgno, + DPRINTF(("pushing page %"Yu" on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *) mc)); if (mc->mc_snum >= CURSOR_STACK) { @@ -5979,6 +6244,7 @@ ok: #endif /** Find the address of the page corresponding to a given page number. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc the cursor accessing the page. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. @@ -5989,9 +6255,6 @@ static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) { MDB_txn *txn = mc->mc_txn; -#ifndef MDB_VL32 - MDB_env *env = txn->mt_env; -#endif MDB_page *p = NULL; int level; @@ -6010,14 +6273,7 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) MDB_ID pn = pgno << 1; x = mdb_midl_search(tx2->mt_spill_pgs, pn); if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { -#ifdef MDB_VL32 - int rc = mdb_rpage_get(txn, pgno, &p); - if (rc) - return rc; -#else - p = (MDB_page *)(env->me_map + env->me_psize * pgno); -#endif - goto done; + goto mapped; } } if (dl[0].mid) { @@ -6031,21 +6287,26 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) } while ((tx2 = tx2->mt_parent) != NULL); } - if (pgno < txn->mt_next_pgno) { - level = 0; + if (pgno >= txn->mt_next_pgno) { + DPRINTF(("page %"Yu" not found", pgno)); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; + } + + level = 0; + +mapped: + { #ifdef MDB_VL32 - { - int rc = mdb_rpage_get(txn, pgno, &p); - if (rc) - return rc; + int rc = mdb_rpage_get(txn, pgno, &p); + if (rc) { + txn->mt_flags |= MDB_TXN_ERROR; + return rc; } #else + MDB_env *env = txn->mt_env; p = (MDB_page *)(env->me_map + env->me_psize * pgno); #endif - } else { - DPRINTF(("page %"Y"u not found", pgno)); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_NOTFOUND; } done: @@ -6069,18 +6330,27 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) MDB_node *node; indx_t i; - DPRINTF(("branch page %"Y"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); + DPRINTF(("branch page %"Yu" has %u keys", mp->mp_pgno, NUMKEYS(mp))); /* Don't assert on branch pages in the FreeDB. We can get here * while in the process of rebalancing a FreeDB branch page; we must * let that proceed. ITS#8336 */ mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - DPRINTF(("found index 0 to page %"Y"u", NODEPGNO(NODEPTR(mp, 0)))); + DPRINTF(("found index 0 to page %"Yu, NODEPGNO(NODEPTR(mp, 0)))); if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { i = 0; - if (flags & MDB_PS_LAST) + if (flags & MDB_PS_LAST) { i = NUMKEYS(mp) - 1; + /* if already init'd, see if we're already in right place */ + if (mc->mc_flags & C_INITIALIZED) { + if (mc->mc_ki[mc->mc_top] == i) { + mc->mc_top = mc->mc_snum++; + mp = mc->mc_pg[mc->mc_top]; + goto ready; + } + } + } } else { int exact; node = mdb_node_search(mc, key, &exact); @@ -6106,6 +6376,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) if ((rc = mdb_cursor_push(mc, mp))) return rc; +ready: if (flags & MDB_PS_MODIFY) { if ((rc = mdb_page_touch(mc)) != 0) return rc; @@ -6120,7 +6391,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) return MDB_CORRUPTED; } - DPRINTF(("found leaf page %"Y"u for key [%s]", mp->mp_pgno, + DPRINTF(("found leaf page %"Yu" for key [%s]", mp->mp_pgno, key ? DKEY(key) : "null")); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -6235,7 +6506,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) mc->mc_snum = 1; mc->mc_top = 0; - DPRINTF(("db %d root page %"Y"u has flags 0x%X", + DPRINTF(("db %d root page %"Yu" has flags 0x%X", DDBI(mc), root, mc->mc_pg[0]->mp_flags)); if (flags & MDB_PS_MODIFY) { @@ -6260,7 +6531,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) MDB_ID pn = pg << 1; int rc; - DPRINTF(("free ov page %"Y"u (%d)", pg, ovpages)); + DPRINTF(("free ov page %"Yu" (%d)", pg, ovpages)); /* If the page is dirty or on the spill list we just acquired it, * so we should give it back to our current free list, if any. * Otherwise put it onto the list of pages we freed in this txn. @@ -6301,7 +6572,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) j = ++(dl[0].mid); dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; + return MDB_PROBLEM; } } txn->mt_dirty_room++; @@ -6342,12 +6613,10 @@ mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) pgno_t pgno; int rc; -#ifdef MDB_VL32 - if (mc->mc_ovpg) { - MDB_PAGE_UNREF(mc->mc_txn, mc->mc_ovpg); - mc->mc_ovpg = 0; + if (MC_OVPG(mc)) { + MDB_PAGE_UNREF(mc->mc_txn, MC_OVPG(mc)); + MC_SET_OVPG(mc, NULL); } -#endif if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { data->mv_size = NODEDSZ(leaf); data->mv_data = NODEDATA(leaf); @@ -6359,13 +6628,11 @@ mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { - DPRINTF(("read overflow page %"Y"u failed", pgno)); + DPRINTF(("read overflow page %"Yu" failed", pgno)); return rc; } data->mv_data = METADATA(omp); -#ifdef MDB_VL32 - mc->mc_ovpg = omp; -#endif + MC_SET_OVPG(mc, omp); return MDB_SUCCESS; } @@ -6389,14 +6656,10 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, mdb_cursor_init(&mc, txn, dbi, &mx); rc = mdb_cursor_set(&mc, key, data, MDB_SET, &exact); -#ifdef MDB_VL32 - { - /* unref all the pages - caller must copy the data - * before doing anything else - */ - mdb_cursor_unref(&mc); - } -#endif + /* unref all the pages when MDB_VL32 - caller must copy the data + * before doing anything else + */ + MDB_CURSOR_UNREF(&mc, 1); return rc; } @@ -6426,7 +6689,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) op = mc->mc_pg[mc->mc_top]; #endif mdb_cursor_pop(mc); - DPRINTF(("parent page is page %"Y"u, index %u", + DPRINTF(("parent page is page %"Yu", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) @@ -6473,14 +6736,20 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - if (mc->mc_flags & C_EOF) { + if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) return MDB_NOTFOUND; - } - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + if (!(mc->mc_flags & C_INITIALIZED)) + return mdb_cursor_first(mc, key, data); mp = mc->mc_pg[mc->mc_top]; + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -6492,13 +6761,9 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } } -#ifdef MDB_VL32 else { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); - } + MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); } -#endif } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_NEXT_DUP) @@ -6506,7 +6771,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF(("cursor_next: top page is %"Y"u in cursor %p", + DPRINTF(("cursor_next: top page is %"Yu" in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; @@ -6520,12 +6785,12 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } mp = mc->mc_pg[mc->mc_top]; - DPRINTF(("next page is %"Y"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + DPRINTF(("next page is %"Yu", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); } else mc->mc_ki[mc->mc_top]++; skip: - DPRINTF(("==> cursor points to page %"Y"u with %u keys, key index %u", + DPRINTF(("==> cursor points to page %"Yu" with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { @@ -6563,7 +6828,12 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + return rc; + mc->mc_ki[mc->mc_top]++; + } mp = mc->mc_pg[mc->mc_top]; @@ -6580,13 +6850,9 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } } -#ifdef MDB_VL32 else { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); - } + MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); } -#endif } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_PREV_DUP) @@ -6594,7 +6860,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF(("cursor_prev: top page is %"Y"u in cursor %p", + DPRINTF(("cursor_prev: top page is %"Yu" in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); mc->mc_flags &= ~(C_EOF|C_DEL); @@ -6606,13 +6872,11 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } mp = mc->mc_pg[mc->mc_top]; mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - DPRINTF(("prev page is %"Y"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + DPRINTF(("prev page is %"Yu", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); } else mc->mc_ki[mc->mc_top]--; - mc->mc_flags &= ~C_EOF; - - DPRINTF(("==> cursor points to page %"Y"u with %u keys, key index %u", + DPRINTF(("==> cursor points to page %"Yu" with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { @@ -6656,11 +6920,7 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_BAD_VALSIZE; if (mc->mc_xcursor) { -#ifdef MDB_VL32 - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); - } -#endif + MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); } @@ -6728,6 +6988,7 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } rc = 0; + mc->mc_flags &= ~C_EOF; goto set2; } } @@ -6819,10 +7080,8 @@ set1: if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) return rc; dcmp = mc->mc_dbx->md_dcmp; -#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(mdb_size_t)) + if (NEED_CMP_CLONG(dcmp, olddata.mv_size)) dcmp = mdb_cmp_clong; -#endif rc = dcmp(data, &olddata); if (rc) { if (op == MDB_GET_BOTH || rc > 0) @@ -6855,11 +7114,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) MDB_node *leaf; if (mc->mc_xcursor) { -#ifdef MDB_VL32 - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); - } -#endif + MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); } @@ -6905,24 +7160,17 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) MDB_node *leaf; if (mc->mc_xcursor) { -#ifdef MDB_VL32 - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); - } -#endif + MDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0); mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); } - if (!(mc->mc_flags & C_EOF)) { - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (rc != MDB_SUCCESS) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; mc->mc_flags |= C_INITIALIZED|C_EOF; leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); @@ -7036,10 +7284,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_INCOMPATIBLE; break; } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); if (rc == MDB_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDB_cursor *mx; @@ -7081,21 +7326,11 @@ fetchm: case MDB_NEXT: case MDB_NEXT_DUP: case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, op); + rc = mdb_cursor_next(mc, key, data, op); break; case MDB_PREV: case MDB_PREV_DUP: case MDB_PREV_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) - break; - mc->mc_flags |= C_INITIALIZED; - mc->mc_ki[mc->mc_top]++; - } rc = mdb_cursor_prev(mc, key, data, op); break; case MDB_FIRST: @@ -7112,6 +7347,11 @@ fetchm: rc = MDB_INCOMPATIBLE; break; } + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + rc = MDB_NOTFOUND; + break; + } { MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -7153,7 +7393,8 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { + /* Touch DB record of named DB */ MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -7378,10 +7619,8 @@ more: if (flags == MDB_CURRENT) goto current; dcmp = mc->mc_dbx->md_dcmp; -#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(mdb_size_t)) + if (NEED_CMP_CLONG(dcmp, olddata.mv_size)) dcmp = mdb_cmp_clong; -#endif /* does data match? */ if (!dcmp(data, &olddata)) { if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) @@ -7477,8 +7716,9 @@ prep_subDB: } else { memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, olddata.mv_size - fp->mp_upper - PAGEBASE); + memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); for (i=0; i<NUMKEYS(fp); i++) - mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; + mp->mp_ptrs[i] += offset; } } @@ -7532,8 +7772,13 @@ current: /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdb_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the + * parent txn, in case the user peeks at MDB_RESERVEd + * or unused parts. Some users treat ovpages specially. + */ if (!(flags & MDB_RESERVE)) { - /* Copy end of page, adjusting alignment so + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); @@ -7602,11 +7847,7 @@ new_sub: if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { m3->mc_ki[i]++; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - MDB_node *n2 = NODEPTR(mp, m3->mc_ki[i]); - if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); - } + XCURSOR_REFRESH(m3, i, mp); } } } @@ -7648,7 +7889,6 @@ put_sub: MDB_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; - int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; @@ -7656,10 +7896,8 @@ put_sub: if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init2(m2, mx, new_dupdata); - } else if (!insert_key && m2->mc_ki[i] < nkeys) { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); - if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } else if (!insert_key) { + XCURSOR_REFRESH(m2, i, mp); } } } @@ -7701,7 +7939,7 @@ put_sub: return rc; bad_sub: if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ - rc = MDB_CORRUPTED; + rc = MDB_PROBLEM; } mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return rc; @@ -7764,13 +8002,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[mc->mc_top] == mp) { - if (m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } else { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); - if (!(n2->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); - } + XCURSOR_REFRESH(m2, mc->mc_top, mp); } } } @@ -7815,6 +8047,7 @@ fail: } /** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc a cursor on the database being added to. * @param[in] flags flags defining what type of page is being allocated. * @param[in] num the number of pages to allocate. This is usually 1, @@ -7830,7 +8063,7 @@ mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) if ((rc = mdb_page_alloc(mc, num, &np))) return rc; - DPRINTF(("allocated new mpage %"Y"u, page size %u", + DPRINTF(("allocated new mpage %"Yu", page size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize)); np->mp_flags = flags | P_DIRTY; np->mp_lower = (PAGEHDRSZ-PAGEBASE); @@ -7900,6 +8133,7 @@ mdb_branch_size(MDB_env *env, MDB_val *key) } /** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc The cursor for this operation. * @param[in] indx The index on the page where the new node should be added. * @param[in] key The key for the new node. @@ -7930,7 +8164,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - DPRINTF(("add to %s %spage %"Y"u index %i, data size %"Z"u key size %"Z"u [%s]", + DPRINTF(("add to %s %spage %"Yu" index %i, data size %"Z"u key size %"Z"u [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, @@ -7971,7 +8205,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, goto full; if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; - DPRINTF(("allocated overflow page %"Y"u", ofp->mp_pgno)); + DPRINTF(("allocated overflow page %"Yu, ofp->mp_pgno)); flags |= F_BIGDATA; goto update; } else { @@ -8028,7 +8262,7 @@ update: return MDB_SUCCESS; full: - DPRINTF(("not enough room in page %"Y"u, got %u ptrs", + DPRINTF(("not enough room in page %"Yu", got %u ptrs", mdb_dbg_pgno(mp), NUMKEYS(mp))); DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); DPRINTF(("node size = %"Z"u", node_size)); @@ -8051,7 +8285,7 @@ mdb_node_del(MDB_cursor *mc, int ksize) MDB_node *node; char *base; - DPRINTF(("delete node %u on %s page %"Y"u", indx, + DPRINTF(("delete node %u on %s page %"Yu, indx, IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); numkeys = NUMKEYS(mp); mdb_cassert(mc, indx < numkeys); @@ -8160,9 +8394,7 @@ mdb_xcursor_init0(MDB_cursor *mc) mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; -#ifdef MDB_VL32 - mx->mx_cursor.mc_ovpg = 0; -#endif + MC_SET_OVPG(&mx->mx_cursor, NULL); mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP)); mx->mx_dbx.md_name.mv_size = 0; mx->mx_dbx.md_name.mv_data = NULL; @@ -8210,13 +8442,11 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_db.md_flags |= MDB_INTEGERKEY; } } - DPRINTF(("Sub-db -%u root page %"Y"u", mx->mx_cursor.mc_dbi, + DPRINTF(("Sub-db -%u root page %"Yu, mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ -#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(mdb_size_t)) + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; + if (NEED_CMP_CLONG(mx->mx_dbx.md_cmp, mx->mx_db.md_pad)) mx->mx_dbx.md_cmp = mdb_cmp_clong; -#endif } @@ -8238,8 +8468,8 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ -#if UINT_MAX < SIZE_MAX + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; +#if UINT_MAX < MDB_SIZE_MAX /* matches mdb_xcursor_init1:NEED_CMP_CLONG() */ mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; #endif } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { @@ -8247,7 +8477,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) } mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - DPRINTF(("Sub-db -%u root page %"Y"u", mx->mx_cursor.mc_dbi, + DPRINTF(("Sub-db -%u root page %"Yu, mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); } @@ -8266,9 +8496,7 @@ mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) mc->mc_top = 0; mc->mc_pg[0] = 0; mc->mc_ki[0] = 0; -#ifdef MDB_VL32 - mc->mc_ovpg = 0; -#endif + MC_SET_OVPG(mc, NULL); mc->mc_flags = txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP); if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { mdb_tassert(txn, mx != NULL); @@ -8350,9 +8578,15 @@ mdb_cursor_count(MDB_cursor *mc, mdb_size_t *countp) if (!(mc->mc_flags & C_INITIALIZED)) return EINVAL; - if (!mc->mc_snum || (mc->mc_flags & C_EOF)) + if (!mc->mc_snum) return MDB_NOTFOUND; + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { *countp = 1; @@ -8368,13 +8602,14 @@ mdb_cursor_count(MDB_cursor *mc, mdb_size_t *countp) void mdb_cursor_close(MDB_cursor *mc) { -#ifdef MDB_VL32 if (mc) { - mdb_cursor_unref(mc); + MDB_CURSOR_UNREF(mc, 0); } -#endif if (mc && !mc->mc_backup) { - /* remove from txn, if tracked */ + /* Remove from txn, if tracked. + * A read-only txn (!C_UNTRACK) may have been freed already, + * so do not peek inside it. Only write txns track cursors. + */ if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; @@ -8399,6 +8634,7 @@ mdb_cursor_dbi(MDB_cursor *mc) } /** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. @@ -8424,7 +8660,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) char kbuf2[DKBUF_MAXKEYSIZE*2+1]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; - DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Y"u", + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Yu, indx, ptr, mdb_dkey(&k2, kbuf2), DKEY(key), @@ -8572,7 +8808,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) return rc; } - DPRINTF(("moving %s node %u [%s] on page %"Y"u to node %u on page %"Y"u", + DPRINTF(("moving %s node %u [%s] on page %"Yu" to node %u on page %"Yu, IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", csrc->mc_ki[csrc->mc_top], DKEY(&key), @@ -8618,12 +8854,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; m3->mc_ki[csrc->mc_top-1]++; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mps)) { - MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (IS_LEAF(mps)) + XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); } } else /* Adding on the right, bump others down */ @@ -8644,12 +8876,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) } else { m3->mc_ki[csrc->mc_top]--; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mps)) { - MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (IS_LEAF(mps)) + XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); } } } @@ -8666,7 +8894,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF(("update separator for source page %"Y"u to [%s]", + DPRINTF(("update separator for source page %"Yu" to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); mdb_cursor_copy(csrc, &mn); mn.mc_snum--; @@ -8697,7 +8925,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF(("update separator for destination page %"Y"u to [%s]", + DPRINTF(("update separator for destination page %"Yu" to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); mdb_cursor_copy(cdst, &mn); mn.mc_snum--; @@ -8743,7 +8971,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - DPRINTF(("merging page %"Y"u into %"Y"u", psrc->mp_pgno, pdst->mp_pgno)); + DPRINTF(("merging page %"Yu" into %"Yu, psrc->mp_pgno, pdst->mp_pgno)); mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ mdb_cassert(csrc, cdst->mc_snum > 1); @@ -8800,7 +9028,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) } } - DPRINTF(("dst page %"Y"u now has %u keys (%.1f%% filled)", + DPRINTF(("dst page %"Yu" now has %u keys (%.1f%% filled)", pdst->mp_pgno, NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); @@ -8850,12 +9078,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { m3->mc_ki[top-1]--; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(psrc)) { - MDB_node *node = NODEPTR(m3->mc_pg[top], m3->mc_ki[top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); } } { @@ -8888,9 +9112,7 @@ mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; -#ifdef MDB_VL32 - cdst->mc_ovpg = csrc->mc_ovpg; -#endif + MC_SET_OVPG(cdst, MC_OVPG(csrc)); for (i=0; i<csrc->mc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; @@ -8919,14 +9141,14 @@ mdb_rebalance(MDB_cursor *mc) minkeys = 1; thresh = FILL_THRESHOLD; } - DPRINTF(("rebalancing %s page %"Y"u (has %u keys, %.1f%% full)", + DPRINTF(("rebalancing %s page %"Yu" (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - DPRINTF(("no need to rebalance page %"Y"u, above fill threshold", + DPRINTF(("no need to rebalance page %"Yu", above fill threshold", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); return MDB_SUCCESS; } @@ -9055,7 +9277,7 @@ mdb_rebalance(MDB_cursor *mc) fromleft = 1; } - DPRINTF(("found neighbor page %"Y"u (%u keys, %.1f%% full)", + DPRINTF(("found neighbor page %"Yu" (%u keys, %.1f%% full)", mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); @@ -9112,16 +9334,15 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] == ki) { m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) - m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } + continue; } else if (m3->mc_ki[mc->mc_top] > ki) { m3->mc_ki[mc->mc_top]--; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + XCURSOR_REFRESH(m3, mc->mc_top, mp); } } } @@ -9147,11 +9368,32 @@ mdb_cursor_del0(MDB_cursor *mc) continue; if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { + MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. + * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. + */ + if (node->mn_flags & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else { + mdb_xcursor_init1(m3, node); + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + } } } } @@ -9216,7 +9458,6 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, * run out of space, triggering a split. We need this * cursor to be consistent until the end of the rebalance. */ - mc.mc_flags |= C_UNTRACK; mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; rc = mdb_cursor_del(&mc, flags); @@ -9226,6 +9467,7 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, } /** Split a page and insert a new node. + * Set #MDB_TXN_ERROR on failure. * @param[in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. @@ -9257,7 +9499,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno newindx = mc->mc_ki[mc->mc_top]; nkeys = NUMKEYS(mp); - DPRINTF(("-----> splitting %s page %"Y"u and adding [%s] at index %i/%i", + DPRINTF(("-----> splitting %s page %"Yu" and adding [%s] at index %i/%i", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); @@ -9265,7 +9507,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) return rc; rp->mp_pad = mp->mp_pad; - DPRINTF(("new right sibling: page %"Y"u", rp->mp_pgno)); + DPRINTF(("new right sibling: page %"Yu, rp->mp_pgno)); /* Usually when splitting the root page, the cursor * height is 1. But when called from mdb_update_key, @@ -9283,7 +9525,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - DPRINTF(("root split! new root = %"Y"u", pp->mp_pgno)); + DPRINTF(("root split! new root = %"Yu, pp->mp_pgno)); new_root = mc->mc_db->md_depth++; /* Add left (implicit) pointer. */ @@ -9300,7 +9542,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno ptop = 0; } else { ptop = mc->mc_top-1; - DPRINTF(("parent branch page is %"Y"u", mc->mc_pg[ptop]->mp_pgno)); + DPRINTF(("parent branch page is %"Yu, mc->mc_pg[ptop]->mp_pgno)); } mdb_cursor_copy(mc, &mn); @@ -9475,7 +9717,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - mdb_cursor_sibling(mc, 0); + rc = mdb_cursor_sibling(mc, 0); } } } else { @@ -9484,6 +9726,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mn.mc_top++; } if (rc != MDB_SUCCESS) { + if (rc == MDB_NOTFOUND) /* improper mdb_cursor_sibling() result */ + rc = MDB_PROBLEM; goto done; } if (nflags & MDB_APPEND) { @@ -9635,12 +9879,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { m3->mc_ki[ptop]++; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mp)) { - MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (IS_LEAF(mp)) + XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); } } DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); @@ -9681,23 +9921,26 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, #ifndef MDB_WBUF #define MDB_WBUF (1024*1024) #endif +#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ - /** State needed for a compacting copy. */ + /** State needed for a double-buffering compacting copy. */ typedef struct mdb_copy { + MDB_env *mc_env; + MDB_txn *mc_txn; pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; + pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ char *mc_wbuf[2]; char *mc_over[2]; - MDB_env *mc_env; - MDB_txn *mc_txn; int mc_wlen[2]; int mc_olen[2]; pgno_t mc_next_pgno; HANDLE mc_fd; - int mc_status; - volatile int mc_new; - int mc_toggle; - + int mc_toggle; /**< Buffer number in provider */ + int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ + /** Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; } mdb_copy; /** Dedicated writer thread for compacting copy. */ @@ -9713,26 +9956,38 @@ mdb_env_copythr(void *arg) #else int len; #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#ifdef SIGPIPE + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) + my->mc_error = rc; +#endif #endif pthread_mutex_lock(&my->mc_mutex); - my->mc_new = 0; - pthread_cond_signal(&my->mc_cond); for(;;) { while (!my->mc_new) pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new < 0) { - my->mc_new = 0; + if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ break; - } - my->mc_new = 0; wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: - while (wsize > 0) { + rc = MDB_SUCCESS; + while (wsize > 0 && !my->mc_error) { DO_WRITE(rc, my->mc_fd, ptr, wsize, len); if (!rc) { rc = ErrCode(); +#if defined(SIGPIPE) && !defined(_WIN32) + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + } +#endif break; } else if (len > 0) { rc = MDB_SUCCESS; @@ -9745,8 +10000,7 @@ again: } } if (rc) { - my->mc_status = rc; - break; + my->mc_error = rc; } /* If there's an overflow page tail, write it too */ if (my->mc_olen[toggle]) { @@ -9757,38 +10011,45 @@ again: } my->mc_wlen[toggle] = 0; toggle ^= 1; + /* Return the empty buffer to provider */ + my->mc_new--; pthread_cond_signal(&my->mc_cond); } - pthread_cond_signal(&my->mc_cond); pthread_mutex_unlock(&my->mc_mutex); return (THREAD_RET)0; #undef DO_WRITE } - /** Tell the writer thread there's a buffer ready to write */ + /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. + * + * @param[in] my control structure. + * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). + */ static int ESECT -mdb_env_cthr_toggle(mdb_copy *my, int st) +mdb_env_cthr_toggle(mdb_copy *my, int adjust) { - int toggle = my->mc_toggle ^ 1; pthread_mutex_lock(&my->mc_mutex); - if (my->mc_status) { - pthread_mutex_unlock(&my->mc_mutex); - return my->mc_status; - } - while (my->mc_new == 1) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - my->mc_new = st; - my->mc_toggle = toggle; + my->mc_new += adjust; pthread_cond_signal(&my->mc_cond); + while (my->mc_new & 2) /* both buffers in use */ + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); pthread_mutex_unlock(&my->mc_mutex); - return 0; + + my->mc_toggle ^= (adjust & 1); + /* Both threads reset mc_wlen, to be safe from threading errors */ + my->mc_wlen[my->mc_toggle] = 0; + return my->mc_error; } - /** Depth-first tree traversal for compacting copy. */ + /** Depth-first tree traversal for compacting copy. + * @param[in] my control structure. + * @param[in,out] pg database root. + * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. + */ static int ESECT mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { - MDB_cursor mc; + MDB_cursor mc = {0}; MDB_node *ni; MDB_page *mo, *mp, *leaf; char *buf, *ptr; @@ -9800,8 +10061,8 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) return MDB_SUCCESS; mc.mc_snum = 1; - mc.mc_top = 0; mc.mc_txn = my->mc_txn; + mc.mc_flags = my->mc_txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP); rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); if (rc) @@ -9847,6 +10108,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) } memcpy(&pg, NODEDATA(ni), sizeof(pg)); + memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); rc = mdb_page_get(&mc, pg, &omp, NULL); if (rc) goto done; @@ -9869,7 +10131,6 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) goto done; toggle = my->mc_toggle; } - memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); } else if (ni->mn_flags & F_SUBDATA) { MDB_db db; @@ -9947,47 +10208,56 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) { MDB_meta *mm; MDB_page *mp; - mdb_copy my; + mdb_copy my = {0}; MDB_txn *txn = NULL; pthread_t thr; - int rc; + pgno_t root, new_root; + int rc = MDB_SUCCESS; #ifdef _WIN32 - my.mc_mutex = CreateMutex(NULL, FALSE, NULL); - my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL); + if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || + !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { + rc = ErrCode(); + goto done; + } my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); - if (my.mc_wbuf[0] == NULL) - return errno; + if (my.mc_wbuf[0] == NULL) { + /* _aligned_malloc() sets errno, but we use Windows error codes */ + rc = ERROR_NOT_ENOUGH_MEMORY; + goto done; + } #else - pthread_mutex_init(&my.mc_mutex, NULL); - pthread_cond_init(&my.mc_cond, NULL); + if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) + return rc; + if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) + goto done2; #ifdef HAVE_MEMALIGN my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); - if (my.mc_wbuf[0] == NULL) - return errno; + if (my.mc_wbuf[0] == NULL) { + rc = errno; + goto done; + } #else - rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); - if (rc) - return rc; + { + void *p; + if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) + goto done; + my.mc_wbuf[0] = p; + } #endif #endif memset(my.mc_wbuf[0], 0, MDB_WBUF*2); my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; - my.mc_wlen[0] = 0; - my.mc_wlen[1] = 0; - my.mc_olen[0] = 0; - my.mc_olen[1] = 0; my.mc_next_pgno = NUM_METAS; - my.mc_status = 0; - my.mc_new = 1; - my.mc_toggle = 0; my.mc_env = env; my.mc_fd = fd; - THREAD_CREATE(thr, mdb_env_copythr, &my); + rc = THREAD_CREATE(thr, mdb_env_copythr, &my); + if (rc) + goto done; rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); if (rc) - return rc; + goto finish; mp = (MDB_page *)my.mc_wbuf[0]; memset(mp, 0, NUM_METAS * env->me_psize); @@ -10003,57 +10273,64 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) *(MDB_meta *)METADATA(mp) = *mm; mm = (MDB_meta *)METADATA(mp); - /* Count the number of free pages, subtract from lastpg to find - * number of active pages - */ - { + /* Set metapage 1 with current main DB */ + root = new_root = txn->mt_dbs[MAIN_DBI].md_root; + if (root != P_INVALID) { + /* Count free pages + freeDB pages. Subtract from last_pg + * to find the new last_pg, which also becomes the new root. + */ MDB_ID freecount = 0; MDB_cursor mc; MDB_val key, data; mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; + if (rc != MDB_NOTFOUND) + goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages + txn->mt_dbs[FREE_DBI].md_overflow_pages; - /* Set metapage 1 */ - mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; + new_root = txn->mt_next_pgno - 1 - freecount; + mm->mm_last_pg = new_root; mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - if (mm->mm_last_pg > NUM_METAS-1) { - mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; - mm->mm_txnid = 1; - } else { - mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; - } + mm->mm_dbs[MAIN_DBI].md_root = new_root; + } else { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. + */ + mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + } + if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { + mm->mm_txnid = 1; /* use metapage 1 */ } + my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); - if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) - rc = mdb_env_cthr_toggle(&my, 1); - mdb_env_cthr_toggle(&my, -1); - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - THREAD_FINISH(thr); + rc = mdb_env_cwalk(&my, &root, 0); + if (rc == MDB_SUCCESS && root != new_root) { + rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ + } +finish: + if (rc) + my.mc_error = rc; + mdb_env_cthr_toggle(&my, 1 | MDB_EOF); + rc = THREAD_FINISH(thr); mdb_txn_abort(txn); + +done: #ifdef _WIN32 - CloseHandle(my.mc_cond); - CloseHandle(my.mc_mutex); - _aligned_free(my.mc_wbuf[0]); + if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); + if (my.mc_cond) CloseHandle(my.mc_cond); + if (my.mc_mutex) CloseHandle(my.mc_mutex); #else + free(my.mc_wbuf[0]); pthread_cond_destroy(&my.mc_cond); +done2: pthread_mutex_destroy(&my.mc_mutex); - free(my.mc_wbuf[0]); #endif - return rc; + return rc ? rc : my.mc_error; } /** Copy environment as-is. */ @@ -10174,67 +10451,20 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) int ESECT mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) { - int rc, len; - char *lpath; + int rc; + MDB_name fname; HANDLE newfd = INVALID_HANDLE_VALUE; -#ifdef _WIN32 - wchar_t *wpath; -#endif - - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. - */ -#ifdef _WIN32 - rc = utf8_to_utf16(lpath, -1, &wpath, NULL); - if (rc) - goto leave; - newfd = CreateFileW(wpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, - FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); - free(wpath); -#else - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); -#endif - if (newfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } - - if (env->me_psize >= env->me_os_psize) { -#ifdef O_DIRECT - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); -#endif -#ifdef F_NOCACHE /* __APPLE__ */ - rc = fcntl(newfd, F_NOCACHE, 1); - if (rc) { - rc = ErrCode(); - goto leave; - } -#endif + rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); + if (rc == MDB_SUCCESS) { + rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); + mdb_fname_destroy(fname); } - - rc = mdb_env_copyfd2(env, newfd, flags); - -leave: - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd != INVALID_HANDLE_VALUE) + if (rc == MDB_SUCCESS) { + rc = mdb_env_copyfd2(env, newfd, flags); if (close(newfd) < 0 && rc == MDB_SUCCESS) rc = ErrCode(); - + } return rc; } @@ -10456,8 +10686,11 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) return MDB_INCOMPATIBLE; - } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { - return rc; + } else { + if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) + return rc; + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; } /* Done here so we cannot fail after creating a new DB */ @@ -10471,7 +10704,8 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db memset(&dummy, 0, sizeof(dummy)); dummy.md_root = P_INVALID; dummy.md_flags = flags & PERSISTENT_FLAGS; - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + WITH_CURSOR_TRACKING(mc, + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); dbflag |= DB_DIRTY; } @@ -10641,10 +10875,8 @@ pop: done: if (rc) txn->mt_flags |= MDB_TXN_ERROR; -#ifdef MDB_VL32 /* drop refcount for mx's pages */ - mdb_cursor_unref(&mx); -#endif + MDB_CURSOR_UNREF(&mx, 0); } else if (rc == MDB_NOTFOUND) { rc = MDB_SUCCESS; } @@ -10764,7 +10996,7 @@ mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; sprintf(buf, txnid == (txnid_t)-1 ? - "%10d %"Z"x -\n" : "%10d %"Z"x %"Y"u\n", + "%10d %"Z"x -\n" : "%10d %"Z"x %"Yu"\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); if (first) { first = 0; @@ -10833,7 +11065,7 @@ mdb_reader_check(MDB_env *env, int *dead) return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; } -/** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>. */ +/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ static int ESECT mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { @@ -10869,7 +11101,7 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead) } for (; j<rdrs; j++) if (mr[j].mr_pid == pid) { - DPRINTF(("clear stale reader pid %u txn %"Y"d", + DPRINTF(("clear stale reader pid %u txn %"Yd, (unsigned) pid, mr[j].mr_txnid)); mr[j].mr_pid = 0; count++; @@ -10936,25 +11168,32 @@ mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) return rc; } #endif /* MDB_ROBUST_SUPPORTED */ -/** @} */ #if defined(_WIN32) -static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize) -{ - int need; - wchar_t *result; - need = MultiByteToWideChar(CP_UTF8, 0, src, srcsize, NULL, 0); - if (need == 0xFFFD) - return EILSEQ; - if (need == 0) - return EINVAL; - result = malloc(sizeof(wchar_t) * need); - if (!result) - return ENOMEM; - MultiByteToWideChar(CP_UTF8, 0, src, srcsize, result, need); - if (dstsize) - *dstsize = need; - *dst = result; - return 0; +/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ +static int ESECT +utf8_to_utf16(const char *src, MDB_name *dst, int xtra) +{ + int rc, need = 0; + wchar_t *result = NULL; + for (;;) { /* malloc result, then fill it in */ + need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); + if (!need) { + rc = ErrCode(); + free(result); + return rc; + } + if (!result) { + result = malloc(sizeof(wchar_t) * (need + xtra)); + if (!result) + return ENOMEM; + continue; + } + dst->mn_alloced = 1; + dst->mn_len = need - 1; + dst->mn_val = result; + return MDB_SUCCESS; + } } #endif /* defined(_WIN32) */ +/** @} */ |