Commit 59131d8e authored by Philipp Reisner's avatar Philipp Reisner

Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block into for-2.6.33

parents 012abeea 4f570f99
This diff is collapsed.
This diff is collapsed.
Description
DRBD is a shared-nothing, synchronously replicated block device. It
is designed to serve as a building block for high availability
clusters and in this context, is a "drop-in" replacement for shared
storage. Simplistically, you could see it as a network RAID 1.
Please visit http://www.drbd.org to find out more.
The files included here are intended to help understand the implementation.
DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
relate some functions and write packets to each other.
conn-states-8.dot, disk-states-8.dot, node-states-8.dot
The sub graphs of DRBD's state transitions
digraph conn_states {
StandAlone -> WFConnection [ label = "ioctl_set_net()" ]
WFConnection -> Unconnected [ label = "unable to bind()" ]
WFConnection -> WFReportParams [ label = "in connect() after accept" ]
WFReportParams -> StandAlone [ label = "checks in receive_param()" ]
WFReportParams -> Connected [ label = "in receive_param()" ]
WFReportParams -> WFBitMapS [ label = "sync_handshake()" ]
WFReportParams -> WFBitMapT [ label = "sync_handshake()" ]
WFBitMapS -> SyncSource [ label = "receive_bitmap()" ]
WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ]
SyncSource -> Connected
SyncTarget -> Connected
SyncSource -> PausedSyncS
SyncTarget -> PausedSyncT
PausedSyncS -> SyncSource
PausedSyncT -> SyncTarget
Connected -> WFConnection [ label = "* on network error" ]
}
digraph disk_states {
Diskless -> Inconsistent [ label = "ioctl_set_disk()" ]
Diskless -> Consistent [ label = "ioctl_set_disk()" ]
Diskless -> Outdated [ label = "ioctl_set_disk()" ]
Consistent -> Outdated [ label = "receive_param()" ]
Consistent -> UpToDate [ label = "receive_param()" ]
Consistent -> Inconsistent [ label = "start resync" ]
Outdated -> Inconsistent [ label = "start resync" ]
UpToDate -> Inconsistent [ label = "ioctl_replicate" ]
Inconsistent -> UpToDate [ label = "resync completed" ]
Consistent -> Failed [ label = "io completion error" ]
Outdated -> Failed [ label = "io completion error" ]
UpToDate -> Failed [ label = "io completion error" ]
Inconsistent -> Failed [ label = "io completion error" ]
Failed -> Diskless [ label = "sending notify to peer" ]
}
// vim: set sw=2 sts=2 :
digraph {
rankdir=BT
bgcolor=white
node [shape=plaintext]
node [fontcolor=black]
StandAlone [ style=filled,fillcolor=gray,label=StandAlone ]
node [fontcolor=lightgray]
Unconnected [ label=Unconnected ]
CommTrouble [ shape=record,
label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
node [fontcolor=gray]
subgraph cluster_try_connect {
label="try to connect, handshake"
rank=max
WFConnection [ label=WFConnection ]
WFReportParams [ label=WFReportParams ]
}
TearDown [ label=TearDown ]
Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
node [fontcolor=lightblue]
StartingSyncS [ label=StartingSyncS ]
StartingSyncT [ label=StartingSyncT ]
subgraph cluster_bitmap_exchange {
node [fontcolor=red]
fontcolor=red
label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
WFBitMapT [ label=WFBitMapT ]
WFSyncUUID [ label=WFSyncUUID ]
WFBitMapS [ label=WFBitMapS ]
}
node [fontcolor=blue]
cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
node [shape=box,fontcolor=black]
// drbdadm [label="drbdadm connect"]
// handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
// comm_error [label="communication trouble"]
//
// edges
// --------------------------------------
StandAlone -> Unconnected [ label="drbdadm connect" ]
Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ]
Unconnected -> WFConnection [ label="receiver thread is started" ]
WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ]
WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
WFReportParams -> WFBitMapS
WFReportParams -> WFBitMapT
WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
WFBitMapS -> cluster_resync:S
WFSyncUUID -> cluster_resync:T
edge [color=green]
cluster_resync:any -> Connected [ label="resync done",fontcolor=green ]
edge [color=red]
WFReportParams -> CommTrouble
Connected -> CommTrouble
cluster_resync:any -> CommTrouble
edge [color=black]
CommTrouble -> Unconnected [label="receiver thread is stopped" ]
}
digraph node_states {
Secondary -> Primary [ label = "ioctl_set_state()" ]
Primary -> Secondary [ label = "ioctl_set_state()" ]
}
digraph peer_states {
Secondary -> Primary [ label = "recv state packet" ]
Primary -> Secondary [ label = "recv state packet" ]
Primary -> Unknown [ label = "connection lost" ]
Secondary -> Unknown [ label = "connection lost" ]
Unknown -> Primary [ label = "connected" ]
Unknown -> Secondary [ label = "connected" ]
}
......@@ -1790,6 +1790,19 @@ S: Maintained
F: drivers/scsi/dpt*
F: drivers/scsi/dpt/
DRBD DRIVER
P: Philipp Reisner
P: Lars Ellenberg
M: drbd-dev@lists.linbit.com
L: drbd-user@lists.linbit.com
W: http://www.drbd.org
T: git git://git.drbd.org/linux-2.6-drbd.git drbd
T: git git://git.drbd.org/drbd-8.3.git
S: Supported
F: drivers/block/drbd/
F: lib/lru_cache.c
F: Documentation/blockdev/drbd/
DRIVER CORE, KOBJECTS, AND SYSFS
M: Greg Kroah-Hartman <gregkh@suse.de>
T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
......
......@@ -12,24 +12,14 @@ config IOSCHED_NOOP
that do their own scheduling and require only minimal assistance from
the kernel.
config IOSCHED_AS
tristate "Anticipatory I/O scheduler"
default y
---help---
The anticipatory I/O scheduler is generally a good choice for most
environments, but is quite large and complex when compared to the
deadline I/O scheduler, it can also be slower in some cases
especially some database loads.
config IOSCHED_DEADLINE
tristate "Deadline I/O scheduler"
default y
---help---
The deadline I/O scheduler is simple and compact, and is often as
good as the anticipatory I/O scheduler, and in some database
workloads, better. In the case of a single process performing I/O to
a disk at any one time, its behaviour is almost identical to the
anticipatory I/O scheduler and so is a good choice.
The deadline I/O scheduler is simple and compact. It will provide
CSCAN service with FIFO expiration of requests, switching to
a new point in the service tree and doing a batch of IO from there
in case of expiry.
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
......@@ -37,7 +27,9 @@ config IOSCHED_CFQ
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
among all processes in the system. It should provide a fair
working environment, suitable for desktop systems.
and low latency working environment, suitable for both desktop
and server systems.
This is the default I/O scheduler.
choice
......@@ -47,9 +39,6 @@ choice
Select the I/O scheduler which will be used by default for all
block devices.
config DEFAULT_AS
bool "Anticipatory" if IOSCHED_AS=y
config DEFAULT_DEADLINE
bool "Deadline" if IOSCHED_DEADLINE=y
......@@ -63,7 +52,6 @@ endchoice
config DEFAULT_IOSCHED
string
default "anticipatory" if DEFAULT_AS
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
default "noop" if DEFAULT_NOOP
......
......@@ -9,7 +9,6 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
......
This diff is collapsed.
This diff is collapsed.
......@@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)
spin_unlock(&elv_list_lock);
if (!strcmp(name, "anticipatory"))
sprintf(elv, "as-iosched");
else
sprintf(elv, "%s-iosched", name);
sprintf(elv, "%s-iosched", name);
request_module("%s", elv);
spin_lock(&elv_list_lock);
......@@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
* Be backwards-compatible with previous kernels, so users
* won't get the wrong elevator.
*/
if (!strcmp(str, "as"))
strcpy(chosen_elevator, "anticipatory");
else
strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
return 1;
}
......
......@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
instead, which can be configured to be on-disk compatible with the
cryptoloop device.
source "drivers/block/drbd/Kconfig"
config BLK_DEV_NBD
tristate "Network block device support"
depends on NET
......
......@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
swim_mod-objs := swim.o swim_asm.o
#
# DRBD device driver configuration
#
comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
depends on !PROC_FS || !INET || !CONNECTOR
config BLK_DEV_DRBD
tristate "DRBD Distributed Replicated Block Device support"
depends on PROC_FS && INET && CONNECTOR
select LRU_CACHE
default n
help
NOTE: In order to authenticate connections you have to select
CRYPTO_HMAC and a hash function as well.
DRBD is a shared-nothing, synchronously replicated block device. It
is designed to serve as a building block for high availability
clusters and in this context, is a "drop-in" replacement for shared
storage. Simplistically, you could see it as a network RAID 1.
Each minor device has a role, which can be 'primary' or 'secondary'.
On the node with the primary device the application is supposed to
run and to access the device (/dev/drbdX). Every write is sent to
the local 'lower level block device' and, across the network, to the
node with the device in 'secondary' state. The secondary device
simply writes the data to its lower level block device.
DRBD can also be used in dual-Primary mode (device writable on both
nodes), which means it can exhibit shared disk semantics in a
shared-nothing cluster. Needless to say, on top of dual-Primary
DRBD, utilizing a cluster file system is necessary to maintain
cache coherency.
For automatic failover you need a cluster manager (e.g. heartbeat).
See also: http://www.drbd.org/, http://www.linux-ha.org
If unsure, say N.
config DRBD_FAULT_INJECTION
bool "DRBD fault injection"
depends on BLK_DEV_DRBD
help
Say Y here if you want to simulate IO errors, in order to test DRBD's
behavior.
The actual simulation of IO errors is done by writing 3 values to
/sys/module/drbd/parameters/
enable_faults: bitmask of...
1 meta data write
2 meta data read
4 resync data write
8 resync data read
16 data write
32 data read
64 read ahead
128 kmalloc of bitmap
256 allocation of EE (epoch_entries)
fault_devs: bitmask of minor numbers
fault_rate: frequency in percent
Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
echo 16 > /sys/module/drbd/parameters/enable_faults
echo 1 > /sys/module/drbd/parameters/fault_devs
echo 5 > /sys/module/drbd/parameters/fault_rate
If unsure, say N.
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/*
drbd.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/drbd.h>
/*
 * Printable names of the connection states.  The table is indexed by the
 * enum drbd_conns value (see drbd_conn_str()); every entry uses a
 * designated initializer, so the order of the lines here does not matter.
 */
static const char *drbd_conn_s_names[] = {
[C_STANDALONE] = "StandAlone",
[C_DISCONNECTING] = "Disconnecting",
[C_UNCONNECTED] = "Unconnected",
[C_TIMEOUT] = "Timeout",
[C_BROKEN_PIPE] = "BrokenPipe",
[C_NETWORK_FAILURE] = "NetworkFailure",
[C_PROTOCOL_ERROR] = "ProtocolError",
[C_WF_CONNECTION] = "WFConnection",
[C_WF_REPORT_PARAMS] = "WFReportParams",
[C_TEAR_DOWN] = "TearDown",
[C_CONNECTED] = "Connected",
[C_STARTING_SYNC_S] = "StartingSyncS",
[C_STARTING_SYNC_T] = "StartingSyncT",
[C_WF_BITMAP_S] = "WFBitMapS",
[C_WF_BITMAP_T] = "WFBitMapT",
[C_WF_SYNC_UUID] = "WFSyncUUID",
[C_SYNC_SOURCE] = "SyncSource",
[C_SYNC_TARGET] = "SyncTarget",
[C_PAUSED_SYNC_S] = "PausedSyncS",
[C_PAUSED_SYNC_T] = "PausedSyncT",
[C_VERIFY_S] = "VerifyS",
[C_VERIFY_T] = "VerifyT",
};
/*
 * Printable names of the node roles, indexed by the enum drbd_role value
 * (see drbd_role_str()).
 */
static const char *drbd_role_s_names[] = {
[R_PRIMARY] = "Primary",
[R_SECONDARY] = "Secondary",
[R_UNKNOWN] = "Unknown"
};
/*
 * Printable names of the disk states, indexed by the enum drbd_disk_state
 * value (see drbd_disk_str()).  Note that D_UNKNOWN is spelled "DUnknown".
 */
static const char *drbd_disk_s_names[] = {
[D_DISKLESS] = "Diskless",
[D_ATTACHING] = "Attaching",
[D_FAILED] = "Failed",
[D_NEGOTIATING] = "Negotiating",
[D_INCONSISTENT] = "Inconsistent",
[D_OUTDATED] = "Outdated",
[D_UNKNOWN] = "DUnknown",
[D_CONSISTENT] = "Consistent",
[D_UP_TO_DATE] = "UpToDate",
};
/*
 * Human readable explanations for failed state changes.  The table is
 * indexed by the NEGATED error code (each designator is -SS_xxx), which is
 * how drbd_set_st_err_str() looks entries up.
 */
static const char *drbd_state_sw_errors[] = {
[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
[-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
[-SS_DEVICE_IN_USE] = "Device is held open by someone",
[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
};
/*
 * drbd_conn_str() - printable name of a connection state
 * @s: connection state to translate
 *
 * Returns the matching entry of drbd_conn_s_names[], or "TOO_LARGE" when @s
 * lies beyond the end of that table.
 *
 * The bound is derived from the table itself rather than from a hard-coded
 * "last" enumerator: the previous check against C_PAUSED_SYNC_T ignored the
 * C_VERIFY_S and C_VERIFY_T entries the table also provides.  The cast to
 * unsigned additionally rejects negative values, should the compiler pick a
 * signed representation for the enum.
 */
const char *drbd_conn_str(enum drbd_conns s)
{
	const unsigned int nr_names =
		sizeof(drbd_conn_s_names) / sizeof(drbd_conn_s_names[0]);

	if ((unsigned int)s >= nr_names)
		return "TOO_LARGE";

	return drbd_conn_s_names[s];
}
/*
 * drbd_role_str() - printable name of a node role
 * @s: role to translate
 *
 * Returns "TOO_LARGE" for values above R_SECONDARY, otherwise the
 * corresponding drbd_role_s_names[] entry.
 */
const char *drbd_role_str(enum drbd_role s)
{
	if (s > R_SECONDARY)
		return "TOO_LARGE";

	return drbd_role_s_names[s];
}
/*
 * drbd_disk_str() - printable name of a disk state
 * @s: disk state to translate
 *
 * Returns "TOO_LARGE" for values above D_UP_TO_DATE, otherwise the
 * corresponding drbd_disk_s_names[] entry.
 */
const char *drbd_disk_str(enum drbd_disk_state s)
{
	if (s > D_UP_TO_DATE)
		return "TOO_LARGE";

	return drbd_disk_s_names[s];
}
/*
 * drbd_set_st_err_str() - explanation for a failed state change
 * @err: state change return code
 *
 * Codes at or below SS_AFTER_LAST_ERROR yield "TOO_SMALL", codes above
 * SS_TWO_PRIMARIES yield "TOO_LARGE".  Everything in between is looked up
 * in drbd_state_sw_errors[], which is indexed by the negated code.
 */
const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
{
	if (err <= SS_AFTER_LAST_ERROR)
		return "TOO_SMALL";

	if (err > SS_TWO_PRIMARIES)
		return "TOO_LARGE";

	return drbd_state_sw_errors[-err];
}
This diff is collapsed.
This diff is collapsed.
#ifndef _DRBD_WRAPPERS_H
#define _DRBD_WRAPPERS_H
#include <linux/ctype.h>
#include <linux/mm.h>
/* see get_sb_bdev and bd_claim */
extern char *drbd_sec_holder;
/*
 * Set the size of our virtual device.
 * @size is given in 512 byte sectors; it is passed to set_capacity() for
 * mdev->vdisk and, converted to bytes, stored as i_size of the inode
 * behind mdev->this_bdev.
 */
static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
					sector_t size)
{
	const loff_t nr_bytes = (loff_t)size << 9;

	set_capacity(mdev->vdisk, size);
	mdev->this_bdev->bd_inode->i_size = nr_bytes;
}
#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
static inline int drbd_bio_has_active_page(struct bio *bio)
{
struct bio_vec *bvec;
int i;
__bio_for_each_segment(bvec, bio, i, 0) {
if (page_count(bvec->bv_page) > 1)
return 1;
}
return 0;
}
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
extern void drbd_endio_read_sec(struct bio *bio, int error);
extern void drbd_endio_write_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
/*
 * Submit one of our private bios.
 *
 * A bio without a target block device is never submitted: the problem is
 * logged together with a stack dump and the bio is completed with -ENODEV.
 * If FAULT_ACTIVE() reports a fault for @fault_type on @mdev, the bio is
 * completed with -EIO instead of being submitted.  Otherwise it is handed
 * to generic_make_request().
 */
static inline void drbd_generic_make_request(struct drbd_conf *mdev,
					     int fault_type, struct bio *bio)
{
	__release(local);

	if (!bio->bi_bdev) {
		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
				"bio->bi_bdev == NULL\n",
		       mdev_to_minor(mdev));
		dump_stack();
		bio_endio(bio, -ENODEV);
		return;
	}

	if (FAULT_ACTIVE(mdev, fault_type)) {
		bio_endio(bio, -EIO);
		return;
	}

	generic_make_request(bio);
}
/*
 * Plug the request queue of our own block device (mdev->this_bdev).
 *
 * Everything happens under q->queue_lock, taken with spin_lock_irq().
 * Only when the queue is not plugged yet is it plugged here, and in that
 * case the queue's unplug_timer is deleted right afterwards.
 * NOTE(review): presumably blk_plug_device() arms that timer and the
 * del_timer() exists to undo it - confirm against the block layer.
 */
static inline void drbd_plug_device(struct drbd_conf *mdev)
{
struct request_queue *q;
q = bdev_get_queue(mdev->this_bdev);
spin_lock_irq(q->queue_lock);
/* XXX the check on !blk_queue_plugged is redundant,
* implicitly checked in blk_plug_device */
if (!blk_queue_plugged(q)) {
blk_plug_device(q);
/* cancel the timer only on the path that just plugged the queue */
del_timer(&q->unplug_timer);
/* unplugging should not happen automatically... */
}
spin_unlock_irq(q->queue_lock);
}
/*
 * Tell whether @tfm is a hash transformation: mask its algorithm type with
 * CRYPTO_ALG_TYPE_HASH_MASK and compare against CRYPTO_ALG_TYPE_HASH.
 * Returns 1 on a match, 0 otherwise.
 */
static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
{
	if ((crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
	    == CRYPTO_ALG_TYPE_HASH)
		return 1;

	return 0;
}
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
#endif
#endif
This diff is collapsed.
......@@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
return sync_blockdev(I_BDEV(filp->f_mapping->host));
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
int error;
error = sync_blockdev(bdev);
if (error)
return error;
error = blkdev_issue_flush(bdev, NULL);
if (error == -EOPNOTSUPP)
error = 0;
return error;
}
/*
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment