/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1998-2004
 *	Sleepycat Software.  All rights reserved.
 *
 * $Id: region.h,v 11.51 2004/10/15 16:59:39 bostic Exp $
 */

unknown's avatar
unknown committed
10 11 12
#ifndef _DB_REGION_H_
#define	_DB_REGION_H_

unknown's avatar
unknown committed
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
/*
 * The DB environment consists of some number of "regions", which are described
 * by the following four structures:
 *
 *	REGENV	   -- shared information about the environment
 *	REGENV_REF -- file describing system memory version of REGENV
 *	REGION	   -- shared information about a single region
 *	REGINFO	   -- per-process information about a REGION
 *
 * There are three types of memory that hold regions:
 *	per-process heap (malloc)
 *	file mapped into memory (mmap, MapViewOfFile)
 *	system memory (shmget, CreateFileMapping)
 *
 * If the regions are private to a process, they're in malloc.  If they're
 * public, they're in file mapped memory, or, optionally, in system memory.
 * Regions in the filesystem are named "__db.001", "__db.002" and so on.  If
 * we're not using a private environment allocated using malloc(3), the file
 * "__db.001" will always exist, as we use it to synchronize on the regions,
 * whether they exist in file mapped memory or system memory.
 *
 * The file "__db.001" contains a REGENV structure and a linked list of some
 * number of REGION structures.  Each of the REGION structures describes and
 * locks one of the underlying shared regions used by DB.
 *
 *	__db.001
 *	+---------+
 *	|REGENV  |
 *	+---------+   +----------+
 *	|REGION   |-> | __db.002 |
 *	|	  |   +----------+
 *	+---------+   +----------+
 *	|REGION   |-> | __db.003 |
 *	|	  |   +----------+
 *	+---------+   +----------+
 *	|REGION   |-> | __db.004 |
 *	|	  |   +----------+
 *	+---------+
 *
 * The only tricky part about manipulating the regions is correctly creating
 * or joining the REGENV file, i.e., __db.001.  We have to be absolutely sure
 * that only one process creates it, and that everyone else joins it without
 * seeing inconsistent data.  Once that region is created, we can use normal
 * shared locking procedures to do mutual exclusion for all other regions.
 *
 * One of the REGION structures in the main environment region describes the
 * environment region itself.
 *
 * To lock a region, locate the REGION structure that describes it and acquire
 * the region's mutex.  There is one exception to this rule -- the lock for the
 * environment region itself is in the REGENV structure, and not in the REGION
 * that describes the environment region.  That's so that we can acquire a lock
 * without walking linked lists that could potentially change underneath us.
 * The REGION will not be moved or removed during the life of the region, and
 * so long-lived references to it can be held by the process.
 *
 * All requests to create or join a region return a REGINFO structure, which
 * is held by the caller and used to open and subsequently close the reference
 * to the region.  The REGINFO structure contains the per-process information
 * that we need to access the region.
 *
 * The one remaining complication.  If the regions (including the environment
 * region) live in system memory, and the system memory isn't "named" somehow
 * in the filesystem name space, we need some way of finding it.  Do this by
 * writing the REGENV_REF structure into the "__db.001" file.  When we find
 * a __db.001 file that is too small to be a real, on-disk environment, we use
 * the information it contains to redirect to the real "__db.001" file/memory.
 * This currently only happens when the REGENV file is in shared system memory.
 *
 * Although DB does not currently grow regions when they run out of memory, it
 * would be possible to do so.  To grow a region, allocate a new region of the
 * appropriate size, then copy the old region over it and insert the additional
 * space into the already existing shalloc arena.  Callers may have to fix up
 * local references, but that should be easy to do.  This failed in historic
 * versions of DB because the region lock lived in the mapped memory, and when
 * it was unmapped and remapped (or copied), threads could lose track of it.
 * Once we moved that lock into a region that is never unmapped, growing should
 * work.  That all said, current versions of DB don't implement region grow
 * because some systems don't support mutex copying, e.g., from OSF1 V4.0:
 *
 *	The address of an msemaphore structure may be significant.  If the
 *	msemaphore structure contains any value copied from an msemaphore
 *	structure at a different address, the result is undefined.
 */

#if defined(__cplusplus)
extern "C" {
#endif

unknown's avatar
unknown committed
102 103 104 105
/*
 * Region backing-file naming.
 *
 * DB_REGION_PREFIX	 Leading component shared by every region file name.
 * DB_REGION_FMT	 printf-style format producing a region file name from
 *			 an integer; the number is zero-padded to three digits.
 * DB_REGION_ENV	 Name of the primary environment file.
 * DB_REGION_NAME_LENGTH Number of characters in a region file name (the
 *			 length of DB_REGION_ENV, excluding the trailing nul).
 */
#define	DB_REGION_PREFIX	"__db"
#define	DB_REGION_FMT		"__db.%03d"
#define	DB_REGION_ENV		"__db.001"
#define	DB_REGION_NAME_LENGTH	8
unknown's avatar
unknown committed
106 107 108 109 110 111 112 113 114 115 116

/*
 * Region IDs.
 *
 * INVALID_REGION_ID	Out-of-band value: never the ID of a real region.
 * REGION_ID_ENV	ID of the primary environment region.
 */
#define	INVALID_REGION_ID	0
#define	REGION_ID_ENV		1

/*
 * reg_type_t --
 *	Region type.  Stored in the "type" member of both REGION and
 *	struct __db_reginfo_t.
 *
 * The numeric values are spelled out so they stay fixed if an enumerator is
 * ever inserted; they are the same values the implicit numbering produced.
 */
typedef enum {
	INVALID_REGION_TYPE = 0,	/* Out-of-band: not a region type. */
	REGION_TYPE_ENV = 1,
	REGION_TYPE_LOCK = 2,
	REGION_TYPE_LOG = 3,
	REGION_TYPE_MPOOL = 4,
	REGION_TYPE_MUTEX = 5,
	REGION_TYPE_TXN = 6
} reg_type_t;
unknown's avatar
unknown committed
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133

/*
 * Segment IDs are either shmget(2) or Win16 segment identifiers.  They are
 * both stored in a "long", and we need an out-of-band value.
 *
 * The replacement list is parenthesized so the negative constant always
 * expands as a single operand.
 */
#define	INVALID_REGION_SEGID	(-1)

/*
 * Nothing can live at region offset 0, because, in all cases, that's where
 * we store *something*.  Lots of code needs an out-of-band value for region
 * offsets, so we use 0.
 */
#define	INVALID_ROFF		0

/*
 * REGENV_REF --
 *	Reference describing system memory version of REGENV.
 *
 * Member order and types are unchanged; only non-code residue that had been
 * interleaved with the members was removed.
 */
typedef struct __db_reg_env_ref {
	roff_t	   size;		/* Region size. */
	long	   segid;		/* UNIX shmget ID, VxWorks ID. */
} REGENV_REF;

/* Per-environment region information. */
typedef struct __db_reg_env {
	/*
	 * !!!
	 * The mutex must be the first entry in the structure to guarantee
	 * correct alignment.
	 */
unknown's avatar
unknown committed
144
	DB_MUTEX   mutex;		/* Environment mutex. */
unknown's avatar
unknown committed
145 146 147 148 149 150 151 152

	/*
	 * !!!
	 * Note, the magic and panic fields are NOT protected by any mutex,
	 * and for this reason cannot be anything more complicated than a
	 * zero/non-zero value.
	 */
	u_int32_t  magic;		/* Valid region magic number. */
unknown's avatar
unknown committed
153
	u_int32_t  envid;		/* Unique environment ID. */
unknown's avatar
unknown committed
154

unknown's avatar
unknown committed
155
	int	   envpanic;		/* Environment is dead. */
unknown's avatar
unknown committed
156 157 158 159 160 161

	int	   majver;		/* Major DB version number. */
	int	   minver;		/* Minor DB version number. */
	int	   patch;		/* Patch DB version number. */

	u_int32_t  init_flags;		/* Flags the env was initialized with.*/
unknown's avatar
unknown committed
162
	roff_t	   cipher_off;		/* Offset of cipher area */
unknown's avatar
unknown committed
163 164 165 166 167 168

					/* List of regions. */
	SH_LIST_HEAD(__db_regionh) regionq;

	u_int32_t  refcnt;		/* References to the environment. */

unknown's avatar
unknown committed
169
	roff_t	   rep_off;		/* Offset of the replication area. */
unknown's avatar
unknown committed
170 171 172 173 174
#define	DB_REGENV_REPLOCKED	0x0001	/* Env locked for rep backup. */
	u_int32_t  flags;		/* Shared environment flags. */
#define	DB_REGENV_TIMEOUT	30	/* Backup timeout. */
	time_t	   op_timestamp;	/* Timestamp for operations. */
	time_t	   rep_timestamp;	/* Timestamp for rep db handles. */
unknown's avatar
unknown committed
175

unknown's avatar
unknown committed
176 177 178 179 180 181 182 183 184 185 186 187 188 189
	size_t	   pad;			/* Guarantee that following memory is
					 * size_t aligned.  This is necessary
					 * because we're going to store the
					 * allocation region information there.
					 */
} REGENV;

/* Per-region shared region information. */
typedef struct __db_region {
	/*
	 * !!!
	 * The mutex must be the first entry in the structure to guarantee
	 * correct alignment.
	 */
unknown's avatar
unknown committed
190
	DB_MUTEX   mutex;		/* Region mutex. */
unknown's avatar
unknown committed
191 192 193

	SH_LIST_ENTRY q;		/* Linked list of REGIONs. */

unknown's avatar
unknown committed
194
	reg_type_t type;		/* Region type. */
unknown's avatar
unknown committed
195 196
	u_int32_t  id;			/* Region id. */

unknown's avatar
unknown committed
197 198
	roff_t	   size_orig;		/* Region size in bytes (original). */
	roff_t	   size;		/* Region size in bytes (adjusted). */
unknown's avatar
unknown committed
199 200 201 202 203 204 205 206 207 208

	roff_t	   primary;		/* Primary data structure offset. */

	long	   segid;		/* UNIX shmget(2), Win16 segment ID. */
} REGION;

/*
 * Per-process/per-attachment information about a single region.
 */
struct __db_reginfo_t {		/* __db_r_attach IN parameters. */
unknown's avatar
unknown committed
209 210
	DB_ENV	   *dbenv;		/* Enclosing environment. */
	reg_type_t  type;		/* Region type. */
unknown's avatar
unknown committed
211 212 213 214 215 216 217
	u_int32_t   id;			/* Region id. */

				/* __db_r_attach OUT parameters. */
	REGION	   *rp;			/* Shared region. */

	char	   *name;		/* Region file name. */

unknown's avatar
unknown committed
218 219
	void	   *addr_orig;		/* Region address (original). */
	void	   *addr;		/* Region address (adjusted). */
unknown's avatar
unknown committed
220 221
	void	   *primary;		/* Primary data structure address. */

unknown's avatar
unknown committed
222 223 224 225 226 227
	size_t	    max_alloc;		/* Maximum bytes allocated. */
	size_t	    allocated;		/* Bytes allocated. */

#ifdef DB_WIN32
	HANDLE	wnt_handle;		/* Win/NT HANDLE. */
#endif
unknown's avatar
unknown committed
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258

#define	REGION_CREATE		0x01	/* Caller created region. */
#define	REGION_CREATE_OK	0x02	/* Caller willing to create region. */
#define	REGION_JOIN_OK		0x04	/* Caller is looking for a match. */
	u_int32_t   flags;
};

/*
 * Mutex maintenance information each subsystem region must keep track
 * of to manage resources adequately.
 *
 * REGMAINT_STAT is a set of six 32-bit counters; it is embedded in REGMAINT
 * as the "stat" member.  Nothing in this header updates the counters, so
 * the per-member notes below are inferred from the member names only --
 * NOTE(review): confirm against the code that maintains them.
 */
typedef struct __db_regmaint_stat_t {
	u_int32_t	st_hint_hit;	/* presumably: slot hint was usable */
	u_int32_t	st_hint_miss;	/* presumably: slot hint was not usable */
	u_int32_t	st_records;	/* presumably: mutexes recorded */
	u_int32_t	st_clears;	/* presumably: mutex slots cleared */
	u_int32_t	st_destroys;	/* presumably: mutexes destroyed */
	u_int32_t	st_max_locks;	/* presumably: high-water mark */
} REGMAINT_STAT;

/*
 * REGMAINT --
 *	Header of a per-region table of mutexes, followed by the table itself.
 *
 * regmutexes is declared with one element, so sizeof(REGMAINT) includes
 * space for one roff_t slot.  NOTE(review): reglocks is presumably the real
 * slot count, with the array over-allocated past the end of the structure;
 * confirm against the allocation code before changing the declaration.
 */
typedef struct __db_regmaint_t {
	u_int32_t  reglocks;		/* Maximum # of mutexes we track. */
	u_int32_t  regmutex_hint;	/* Hint for next slot */
	REGMAINT_STAT stat;		/* Stats */
	roff_t	   regmutexes[1];	/* Region mutexes in use. */
} REGMAINT;

/*
 * R_ADDR	Return a per-process address for a shared region offset.
 * R_OFFSET	Return a shared region offset for a per-process address.
 */
unknown's avatar
unknown committed
259 260 261 262 263 264
/*
 * R_ADDR(reginfop, offset) --
 *	Yield a void * for a region offset.  When DB_ENV_PRIVATE is not set
 *	on reginfop->dbenv, the result is reginfop->addr advanced by offset
 *	bytes; when it is set, the offset value itself is converted to a
 *	pointer.
 *
 * R_OFFSET(reginfop, p) --
 *	The inverse: yield a roff_t for a pointer.  When DB_ENV_PRIVATE is
 *	not set, the result is the byte distance from reginfop->addr to p;
 *	when it is set, the pointer value itself is converted to a roff_t.
 *
 * Each argument is expanded at most once per evaluation path, but reginfop
 * appears in both the test and the result, so it must be free of side
 * effects.
 */
#define	R_ADDR(reginfop, offset)					\
	(!F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE) ?			\
	    (void *)((u_int8_t *)(reginfop)->addr + (offset)) :		\
	    (void *)(offset))
#define	R_OFFSET(reginfop, p)						\
	(!F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE) ?			\
	    (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr) :	\
	    (roff_t)(p))
unknown's avatar
unknown committed
265 266 267 268 269 270

/*
 * R_LOCK	Lock/unlock a region.
 * R_UNLOCK
 *
 * Both macros pass dbenv and the address of the mutex embedded in the
 * shared REGION (reginfo->rp->mutex) to MUTEX_LOCK / MUTEX_UNLOCK, and
 * expand to that single call expression.
 */
#define	R_LOCK(dbenv, reginfo)						\
	MUTEX_LOCK(dbenv, &(reginfo)->rp->mutex)
#define	R_UNLOCK(dbenv, reginfo)					\
	MUTEX_UNLOCK(dbenv, &(reginfo)->rp->mutex)

/*
 * PANIC_CHECK:	Check to see if the DB environment is dead.
 *
 * If DB_ENV_NOPANIC is not set on dbenv, dbenv->reginfo is non-NULL, and the
 * REGENV found through reginfo->primary has a non-zero envpanic, this macro
 * executes "return (__db_panic_msg(dbenv));" in the CALLING function.
 * Otherwise it does nothing.
 *
 * The body is wrapped in do { } while (0) so the macro is a single
 * statement: a bare "if" would capture a following "else" when the macro is
 * used as the body of an if/else.  Invoke it with a trailing semicolon.
 * dbenv is expanded more than once and must be free of side effects.
 */
#define	PANIC_CHECK(dbenv) do {						\
	if (!F_ISSET((dbenv), DB_ENV_NOPANIC) &&			\
	    (dbenv)->reginfo != NULL && ((REGENV *)			\
	    ((REGINFO *)(dbenv)->reginfo)->primary)->envpanic != 0)	\
		return (__db_panic_msg(dbenv));				\
} while (0)

/*
 * PANIC_SET:	Set or clear the environment's panic state.
 *
 * If dbenv->reginfo is non-NULL, store onoff in the envpanic member of the
 * REGENV found through reginfo->primary; otherwise do nothing.
 *
 * The body is wrapped in do { } while (0) so the macro is a single
 * statement: a bare "if" would capture a following "else" when the macro is
 * used as the body of an if/else.  Invoke it with a trailing semicolon.
 * dbenv is expanded more than once and must be free of side effects.
 */
#define	PANIC_SET(dbenv, onoff) do {					\
	if ((dbenv)->reginfo != NULL)					\
		((REGENV *)((REGINFO *)					\
		    (dbenv)->reginfo)->primary)->envpanic = (onoff);	\
} while (0)
unknown's avatar
unknown committed
286 287

/*
 * All regions are created on 8K boundaries out of sheer paranoia, so we
 * don't make some underlying VM unhappy. Make sure we don't overflow or
 * underflow.
 *
 * OS_VMROUNDOFF(i) modifies the lvalue i in place so that it becomes a
 * multiple of OS_VMPAGESIZE.  When i is small enough that adding
 * OS_VMPAGESIZE - 1 stays within UINT32_MAX, i is rounded UP to the next
 * multiple; otherwise the addition is skipped and i is rounded DOWN.
 *
 * The body is wrapped in do { } while (0) so the macro is a single
 * statement: a bare { } block followed by the caller's semicolon cannot be
 * used as the body of an if/else.  Invoke it with a trailing semicolon.
 * i is expanded several times and must be free of side effects.
 */
#define	OS_VMPAGESIZE		(8 * 1024)
#define	OS_VMROUNDOFF(i) do {						\
	if ((i) <							\
	    (UINT32_MAX - OS_VMPAGESIZE) + 1 || (i) < OS_VMPAGESIZE)	\
		(i) += OS_VMPAGESIZE - 1;				\
	(i) -= (i) % OS_VMPAGESIZE;					\
} while (0)
unknown's avatar
unknown committed
299 300 301 302

#if defined(__cplusplus)
}
#endif
unknown's avatar
unknown committed
303
#endif /* !_DB_REGION_H_ */