Commit 8abfc6e7 authored by Linus Torvalds

Merge branch 'for-2.6.37/drivers' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.37/drivers' of git://git.kernel.dk/linux-2.6-block: (95 commits)
  cciss: fix PCI IDs for new Smart Array controllers
  drbd: add race-breaker to drbd_go_diskless
  drbd: use dynamic_dev_dbg to optionally log uuid changes
  dynamic_debug.h: Fix dynamic_dev_dbg() macro if CONFIG_DYNAMIC_DEBUG not set
  drbd: cleanup: change "<= 0" to "== 0"
  drbd: relax the grace period of the md_sync timer again
  drbd: add some more explicit drbd_md_sync
  drbd: drop wrong debug asserts, fix recently introduced race
  drbd: cleanup useless leftover warn/error printk's
  drbd: add explicit drbd_md_sync to drbd_resync_finished
  drbd: Do not log an ASSERT for P_OV_REQUEST packets while C_CONNECTED
  drbd: fix for possible deadlock on IO error during resync
  drbd: fix unlikely access after free and list corruption
  drbd: fix for spurious fullsync (uuids rotated too fast)
  drbd: allow for explicit resync-finished notifications
  drbd: preparation commit, using full state in receive_state()
  drbd: drbd_send_ack_dp must not rely on header information
  drbd: Fix regression in recv_bm_rle_bits (compressed bitmap)
  drbd: Fixed a stupid copy and paste error
  drbd: Allow larger values for c-fill-target.
  ...

Fix up trivial conflict in drivers/block/ataflop.c due to BKL removal
parents e9dd2b68 6362beea
......@@ -115,8 +115,6 @@ static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it does
module_param(fd_def_df0, ulong, 0);
MODULE_LICENSE("GPL");
static struct request_queue *floppy_queue;
/*
* Macros
*/
......@@ -165,6 +163,7 @@ static volatile int selected = -1; /* currently selected drive */
static int writepending;
static int writefromint;
static char *raw_buf;
static int fdc_queue;
static DEFINE_SPINLOCK(amiflop_lock);
......@@ -1335,6 +1334,42 @@ static int get_track(int drive, int track)
return -1;
}
/*
* Round-robin between our available drives, doing one request from each
*/
static struct request *set_next_request(void)
{
struct request_queue *q;
int cnt = FD_MAX_UNITS;
struct request *rq;
/* Find next queue we can dispatch from */
fdc_queue = fdc_queue + 1;
if (fdc_queue == FD_MAX_UNITS)
fdc_queue = 0;
for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) {
if (unit[fdc_queue].type->code == FD_NODRIVE) {
if (++fdc_queue == FD_MAX_UNITS)
fdc_queue = 0;
continue;
}
q = unit[fdc_queue].gendisk->queue;
if (q) {
rq = blk_fetch_request(q);
if (rq)
break;
}
if (++fdc_queue == FD_MAX_UNITS)
fdc_queue = 0;
}
return rq;
}
static void redo_fd_request(void)
{
struct request *rq;
......@@ -1346,7 +1381,7 @@ static void redo_fd_request(void)
int err;
next_req:
rq = blk_fetch_request(floppy_queue);
rq = set_next_request();
if (!rq) {
/* Nothing left to do */
return;
......@@ -1683,6 +1718,13 @@ static int __init fd_probe_drives(void)
continue;
}
unit[drive].gendisk = disk;
disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
if (!disk->queue) {
unit[drive].type->code = FD_NODRIVE;
continue;
}
drives++;
if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
printk("no mem for ");
......@@ -1696,7 +1738,6 @@ static int __init fd_probe_drives(void)
disk->fops = &floppy_fops;
sprintf(disk->disk_name, "fd%d", drive);
disk->private_data = &unit[drive];
disk->queue = floppy_queue;
set_capacity(disk, 880*2);
add_disk(disk);
}
......@@ -1744,11 +1785,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
goto out_irq2;
}
ret = -ENOMEM;
floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock);
if (!floppy_queue)
goto out_queue;
ret = -ENODEV;
if (fd_probe_drives() < 1) /* No usable drives */
goto out_probe;
......@@ -1792,8 +1828,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
return 0;
out_probe:
blk_cleanup_queue(floppy_queue);
out_queue:
free_irq(IRQ_AMIGA_CIAA_TB, NULL);
out_irq2:
free_irq(IRQ_AMIGA_DSKBLK, NULL);
......@@ -1811,9 +1845,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
for( i = 0; i < FD_MAX_UNITS; i++) {
if (unit[i].type->code != FD_NODRIVE) {
struct request_queue *q = unit[i].gendisk->queue;
del_gendisk(unit[i].gendisk);
put_disk(unit[i].gendisk);
kfree(unit[i].trackbuf);
if (q)
blk_cleanup_queue(q);
}
}
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
......@@ -1821,7 +1858,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
free_irq(IRQ_AMIGA_DSKBLK, NULL);
custom.dmacon = DMAF_DISK; /* disable DMA */
amiga_chip_free(raw_buf);
blk_cleanup_queue(floppy_queue);
unregister_blkdev(FLOPPY_MAJOR, "fd");
}
#endif
......
......@@ -80,8 +80,8 @@
#undef DEBUG
static DEFINE_MUTEX(ataflop_mutex);
static struct request_queue *floppy_queue;
static struct request *fd_request;
static int fdc_queue;
/* Disk types: DD, HD, ED */
static struct atari_disk_type {
......@@ -1392,6 +1392,29 @@ static void setup_req_params( int drive )
ReqTrack, ReqSector, (unsigned long)ReqData ));
}
/*
* Round-robin between our available drives, doing one request from each
*/
static struct request *set_next_request(void)
{
struct request_queue *q;
int old_pos = fdc_queue;
struct request *rq;
do {
q = unit[fdc_queue].disk->queue;
if (++fdc_queue == FD_MAX_UNITS)
fdc_queue = 0;
if (q) {
rq = blk_fetch_request(q);
if (rq)
break;
}
} while (fdc_queue != old_pos);
return rq;
}
static void redo_fd_request(void)
{
......@@ -1406,7 +1429,7 @@ static void redo_fd_request(void)
repeat:
if (!fd_request) {
fd_request = blk_fetch_request(floppy_queue);
fd_request = set_next_request();
if (!fd_request)
goto the_end;
}
......@@ -1933,10 +1956,6 @@ static int __init atari_floppy_init (void)
PhysTrackBuffer = virt_to_phys(TrackBuffer);
BufferDrive = BufferSide = BufferTrack = -1;
floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock);
if (!floppy_queue)
goto Enomem;
for (i = 0; i < FD_MAX_UNITS; i++) {
unit[i].track = -1;
unit[i].flags = 0;
......@@ -1945,7 +1964,10 @@ static int __init atari_floppy_init (void)
sprintf(unit[i].disk->disk_name, "fd%d", i);
unit[i].disk->fops = &floppy_fops;
unit[i].disk->private_data = &unit[i];
unit[i].disk->queue = floppy_queue;
unit[i].disk->queue = blk_init_queue(do_fd_request,
&ataflop_lock);
if (!unit[i].disk->queue)
goto Enomem;
set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
add_disk(unit[i].disk);
}
......@@ -1960,10 +1982,14 @@ static int __init atari_floppy_init (void)
return 0;
Enomem:
while (i--)
while (i--) {
struct request_queue *q = unit[i].disk->queue;
put_disk(unit[i].disk);
if (floppy_queue)
blk_cleanup_queue(floppy_queue);
if (q)
blk_cleanup_queue(q);
}
unregister_blkdev(FLOPPY_MAJOR, "fd");
return -ENOMEM;
}
......@@ -2012,12 +2038,14 @@ static void __exit atari_floppy_exit(void)
int i;
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
for (i = 0; i < FD_MAX_UNITS; i++) {
struct request_queue *q = unit[i].disk->queue;
del_gendisk(unit[i].disk);
put_disk(unit[i].disk);
blk_cleanup_queue(q);
}
unregister_blkdev(FLOPPY_MAJOR, "fd");
blk_cleanup_queue(floppy_queue);
del_timer_sync(&fd_timer);
atari_stram_free( DMABuffer );
}
......
......@@ -105,11 +105,12 @@ static const struct pci_device_id cciss_pci_device_id[] = {
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3250},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3251},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3252},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3253},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3254},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3350},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3351},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3352},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3353},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3354},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSF, 0x103C, 0x3355},
{0,}
};
......@@ -149,11 +150,12 @@ static struct board_type products[] = {
{0x3249103C, "Smart Array P812", &SA5_access},
{0x324A103C, "Smart Array P712m", &SA5_access},
{0x324B103C, "Smart Array P711m", &SA5_access},
{0x3250103C, "Smart Array", &SA5_access},
{0x3251103C, "Smart Array", &SA5_access},
{0x3252103C, "Smart Array", &SA5_access},
{0x3253103C, "Smart Array", &SA5_access},
{0x3254103C, "Smart Array", &SA5_access},
{0x3350103C, "Smart Array", &SA5_access},
{0x3351103C, "Smart Array", &SA5_access},
{0x3352103C, "Smart Array", &SA5_access},
{0x3353103C, "Smart Array", &SA5_access},
{0x3354103C, "Smart Array", &SA5_access},
{0x3355103C, "Smart Array", &SA5_access},
};
/* How long to wait (in milliseconds) for board to go into simple mode */
......@@ -1232,164 +1234,138 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
(void)check_for_unit_attention(h, c);
}
/*
* ioctl
*/
static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
ctlr_info_t *h = get_host(disk);
drive_info_struct *drv = get_drv(disk);
void __user *argp = (void __user *)arg;
dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
cmd, arg);
switch (cmd) {
case CCISS_GETPCIINFO:
{
static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
{
cciss_pci_info_struct pciinfo;
if (!arg)
if (!argp)
return -EINVAL;
pciinfo.domain = pci_domain_nr(h->pdev->bus);
pciinfo.bus = h->pdev->bus->number;
pciinfo.dev_fn = h->pdev->devfn;
pciinfo.board_id = h->board_id;
if (copy_to_user
(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
return -EFAULT;
return 0;
}
case CCISS_GETINTINFO:
{
}
static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
{
cciss_coalint_struct intinfo;
if (!arg)
if (!argp)
return -EINVAL;
intinfo.delay =
readl(&h->cfgtable->HostWrite.CoalIntDelay);
intinfo.count =
readl(&h->cfgtable->HostWrite.CoalIntCount);
intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
if (copy_to_user
(argp, &intinfo, sizeof(cciss_coalint_struct)))
return -EFAULT;
return 0;
}
case CCISS_SETINTINFO:
{
}
static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
{
cciss_coalint_struct intinfo;
unsigned long flags;
int i;
if (!arg)
if (!argp)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user
(&intinfo, argp, sizeof(cciss_coalint_struct)))
if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
return -EFAULT;
if ((intinfo.delay == 0) && (intinfo.count == 0))
return -EINVAL;
spin_lock_irqsave(&h->lock, flags);
/* Update the field, and then ring the doorbell */
writel(intinfo.delay,
&(h->cfgtable->HostWrite.CoalIntDelay));
writel(intinfo.count,
&(h->cfgtable->HostWrite.CoalIntCount));
writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
if (!(readl(h->vaddr + SA5_DOORBELL)
& CFGTBL_ChangeReq))
if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
break;
/* delay and try again */
udelay(1000);
udelay(1000); /* delay and try again */
}
spin_unlock_irqrestore(&h->lock, flags);
if (i >= MAX_IOCTL_CONFIG_WAIT)
return -EAGAIN;
return 0;
}
case CCISS_GETNODENAME:
{
}
static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
{
NodeName_type NodeName;
int i;
if (!arg)
if (!argp)
return -EINVAL;
for (i = 0; i < 16; i++)
NodeName[i] =
readb(&h->cfgtable->ServerName[i]);
NodeName[i] = readb(&h->cfgtable->ServerName[i]);
if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
return -EFAULT;
return 0;
}
case CCISS_SETNODENAME:
{
}
static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
{
NodeName_type NodeName;
unsigned long flags;
int i;
if (!arg)
if (!argp)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user
(NodeName, argp, sizeof(NodeName_type)))
if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
return -EFAULT;
spin_lock_irqsave(&h->lock, flags);
/* Update the field, and then ring the doorbell */
for (i = 0; i < 16; i++)
writeb(NodeName[i],
&h->cfgtable->ServerName[i]);
writeb(NodeName[i], &h->cfgtable->ServerName[i]);
writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
if (!(readl(h->vaddr + SA5_DOORBELL)
& CFGTBL_ChangeReq))
if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
break;
/* delay and try again */
udelay(1000);
udelay(1000); /* delay and try again */
}
spin_unlock_irqrestore(&h->lock, flags);
if (i >= MAX_IOCTL_CONFIG_WAIT)
return -EAGAIN;
return 0;
}
}
case CCISS_GETHEARTBEAT:
{
static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
{
Heartbeat_type heartbeat;
if (!arg)
if (!argp)
return -EINVAL;
heartbeat = readl(&h->cfgtable->HeartBeat);
if (copy_to_user
(argp, &heartbeat, sizeof(Heartbeat_type)))
if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
return -EFAULT;
return 0;
}
case CCISS_GETBUSTYPES:
{
}
static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
{
BusTypes_type BusTypes;
if (!arg)
if (!argp)
return -EINVAL;
BusTypes = readl(&h->cfgtable->BusTypes);
if (copy_to_user
(argp, &BusTypes, sizeof(BusTypes_type)))
if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
return -EFAULT;
return 0;
}
case CCISS_GETFIRMVER:
{
}
static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
{
FirmwareVer_type firmware;
if (!arg)
if (!argp)
return -EINVAL;
memcpy(firmware, h->firm_ver, 4);
......@@ -1397,46 +1373,44 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
(argp, firmware, sizeof(FirmwareVer_type)))
return -EFAULT;
return 0;
}
case CCISS_GETDRIVVER:
{
}
static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
{
DriverVer_type DriverVer = DRIVER_VERSION;
if (!arg)
if (!argp)
return -EINVAL;
if (copy_to_user
(argp, &DriverVer, sizeof(DriverVer_type)))
if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
return -EFAULT;
return 0;
}
case CCISS_DEREGDISK:
case CCISS_REGNEWD:
case CCISS_REVALIDVOLS:
return rebuild_lun_table(h, 0, 1);
}
case CCISS_GETLUNINFO:{
static int cciss_getluninfo(ctlr_info_t *h,
struct gendisk *disk, void __user *argp)
{
LogvolInfo_struct luninfo;
drive_info_struct *drv = get_drv(disk);
memcpy(&luninfo.LunID, drv->LunID,
sizeof(luninfo.LunID));
if (!argp)
return -EINVAL;
memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
luninfo.num_opens = drv->usage_count;
luninfo.num_parts = 0;
if (copy_to_user(argp, &luninfo,
sizeof(LogvolInfo_struct)))
if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
return -EFAULT;
return 0;
}
case CCISS_PASSTHRU:
{
}
static int cciss_passthru(ctlr_info_t *h, void __user *argp)
{
IOCTL_Command_struct iocommand;
CommandList_struct *c;
char *buff = NULL;
u64bit temp64;
DECLARE_COMPLETION_ONSTACK(wait);
if (!arg)
if (!argp)
return -EINVAL;
if (!capable(CAP_SYS_RAWIO))
......@@ -1449,11 +1423,6 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
(iocommand.Request.Type.Direction != XFER_NONE)) {
return -EINVAL;
}
#if 0 /* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */
/* Check kmalloc limits */
if (iocommand.buf_size > 128000)
return -EINVAL;
#endif
if (iocommand.buf_size > 0) {
buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
if (buff == NULL)
......@@ -1461,8 +1430,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
}
if (iocommand.Request.Type.Direction == XFER_WRITE) {
/* Copy the data into the buffer we created */
if (copy_from_user
(buff, iocommand.buf, iocommand.buf_size)) {
if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
kfree(buff);
return -EFAULT;
}
......@@ -1478,12 +1446,10 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
c->cmd_type = CMD_IOCTL_PEND;
/* Fill in Command Header */
c->Header.ReplyQueue = 0; /* unused in simple mode */
if (iocommand.buf_size > 0) /* buffer to fill */
{
if (iocommand.buf_size > 0) { /* buffer to fill */
c->Header.SGList = 1;
c->Header.SGTotal = 1;
} else /* no buffers to fill */
{
} else { /* no buffers to fill */
c->Header.SGList = 0;
c->Header.SGTotal = 0;
}
......@@ -1497,8 +1463,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
/* Fill in the scatter gather information */
if (iocommand.buf_size > 0) {
temp64.val = pci_map_single(h->pdev, buff,
iocommand.buf_size,
PCI_DMA_BIDIRECTIONAL);
iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
c->SG[0].Addr.lower = temp64.val32.lower;
c->SG[0].Addr.upper = temp64.val32.upper;
c->SG[0].Len = iocommand.buf_size;
......@@ -1512,16 +1477,13 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
/* unlock the buffers from DMA */
temp64.val32.lower = c->SG[0].Addr.lower;
temp64.val32.upper = c->SG[0].Addr.upper;
pci_unmap_single(h->pdev, (dma_addr_t) temp64.val,
iocommand.buf_size,
pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
PCI_DMA_BIDIRECTIONAL);
check_ioctl_unit_attention(h, c);
/* Copy the error information out */
iocommand.error_info = *(c->err_info);
if (copy_to_user
(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
kfree(buff);
cmd_special_free(h, c);
return -EFAULT;
......@@ -1529,8 +1491,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
if (iocommand.Request.Type.Direction == XFER_READ) {
/* Copy the data out of the buffer we created */
if (copy_to_user
(iocommand.buf, buff, iocommand.buf_size)) {
if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
kfree(buff);
cmd_special_free(h, c);
return -EFAULT;
......@@ -1539,8 +1500,10 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
kfree(buff);
cmd_special_free(h, c);
return 0;
}
case CCISS_BIG_PASSTHRU:{
}
static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
{
BIG_IOCTL_Command_struct *ioc;
CommandList_struct *c;
unsigned char **buff = NULL;
......@@ -1554,7 +1517,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
__u32 sz;
BYTE __user *data_ptr;
if (!arg)
if (!argp)
return -EINVAL;
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
......@@ -1582,14 +1545,12 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
status = -EINVAL;
goto cleanup1;
}
buff =
kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
if (!buff) {
status = -ENOMEM;
goto cleanup1;
}
buff_size = kmalloc(MAXSGENTRIES * sizeof(int),
GFP_KERNEL);
buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
if (!buff_size) {
status = -ENOMEM;
goto cleanup1;
......@@ -1597,9 +1558,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
left = ioc->buf_size;
data_ptr = ioc->buf;
while (left) {
sz = (left >
ioc->malloc_size) ? ioc->
malloc_size : left;
sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
buff_size[sg_used] = sz;
buff[sg_used] = kmalloc(sz, GFP_KERNEL);
if (buff[sg_used] == NULL) {
......@@ -1607,8 +1566,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
goto cleanup1;
}
if (ioc->Request.Type.Direction == XFER_WRITE) {
if (copy_from_user
(buff[sg_used], data_ptr, sz)) {
if (copy_from_user(buff[sg_used], data_ptr, sz)) {
status = -EFAULT;
goto cleanup1;
}
......@@ -1626,32 +1584,20 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
}
c->cmd_type = CMD_IOCTL_PEND;
c->Header.ReplyQueue = 0;
if (ioc->buf_size > 0) {
c->Header.SGList = sg_used;
c->Header.SGTotal = sg_used;
} else {
c->Header.SGList = 0;
c->Header.SGTotal = 0;
}
c->Header.LUN = ioc->LUN_info;
c->Header.Tag.lower = c->busaddr;
c->Request = ioc->Request;
if (ioc->buf_size > 0) {
for (i = 0; i < sg_used; i++) {
temp64.val =
pci_map_single(h->pdev, buff[i],
buff_size[i],
temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
PCI_DMA_BIDIRECTIONAL);
c->SG[i].Addr.lower =
temp64.val32.lower;
c->SG[i].Addr.upper =
temp64.val32.upper;
c->SG[i].Addr.lower = temp64.val32.lower;
c->SG[i].Addr.upper = temp64.val32.upper;
c->SG[i].Len = buff_size[i];
c->SG[i].Ext = 0; /* we are not chaining */
}
}
c->waiting = &wait;
enqueue_cmd_and_start_io(h, c);
wait_for_completion(&wait);
......@@ -1675,8 +1621,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
/* Copy the data out of the buffer we created */
BYTE __user *ptr = ioc->buf;
for (i = 0; i < sg_used; i++) {
if (copy_to_user
(ptr, buff[i], buff_size[i])) {
if (copy_to_user(ptr, buff[i], buff_size[i])) {
cmd_special_free(h, c);
status = -EFAULT;
goto cleanup1;
......@@ -1686,7 +1631,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
}
cmd_special_free(h, c);
status = 0;
cleanup1:
cleanup1:
if (buff) {
for (i = 0; i < sg_used; i++)
kfree(buff[i]);
......@@ -1695,7 +1640,46 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
kfree(buff_size);
kfree(ioc);
return status;
}
}
static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
ctlr_info_t *h = get_host(disk);
void __user *argp = (void __user *)arg;
dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
cmd, arg);
switch (cmd) {
case CCISS_GETPCIINFO:
return cciss_getpciinfo(h, argp);
case CCISS_GETINTINFO:
return cciss_getintinfo(h, argp);
case CCISS_SETINTINFO:
return cciss_setintinfo(h, argp);
case CCISS_GETNODENAME:
return cciss_getnodename(h, argp);
case CCISS_SETNODENAME:
return cciss_setnodename(h, argp);
case CCISS_GETHEARTBEAT:
return cciss_getheartbeat(h, argp);
case CCISS_GETBUSTYPES:
return cciss_getbustypes(h, argp);
case CCISS_GETFIRMVER:
return cciss_getfirmver(h, argp);
case CCISS_GETDRIVVER:
return cciss_getdrivver(h, argp);
case CCISS_DEREGDISK:
case CCISS_REGNEWD:
case CCISS_REVALIDVOLS:
return rebuild_lun_table(h, 0, 1);
case CCISS_GETLUNINFO:
return cciss_getluninfo(h, disk, argp);
case CCISS_PASSTHRU:
return cciss_passthru(h, argp);
case CCISS_BIG_PASSTHRU:
return cciss_bigpassthru(h, argp);
/* scsi_cmd_ioctl handles these, below, though some are not */
/* very meaningful for cciss. SG_IO is the main one people want. */
......
......@@ -965,29 +965,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
spin_lock_irqsave(&mdev->al_lock, flags);
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
if (count) {
/* we need the lock for drbd_try_clear_on_disk_bm */
if (jiffies - mdev->rs_mark_time > HZ*10) {
/* should be rolling marks,
* but we estimate only anyways. */
if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
if (count && get_ldev(mdev)) {
unsigned long now = jiffies;
unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
unsigned long tw = drbd_bm_total_weight(mdev);
if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
mdev->state.conn != C_PAUSED_SYNC_T &&
mdev->state.conn != C_PAUSED_SYNC_S) {
mdev->rs_mark_time = jiffies;
mdev->rs_mark_left = drbd_bm_total_weight(mdev);
mdev->rs_mark_time[next] = now;
mdev->rs_mark_left[next] = tw;
mdev->rs_last_mark = next;
}
}
if (get_ldev(mdev)) {
spin_lock_irqsave(&mdev->al_lock, flags);
drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
put_ldev(mdev);
}
spin_unlock_irqrestore(&mdev->al_lock, flags);
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
put_ldev(mdev);
}
spin_unlock_irqrestore(&mdev->al_lock, flags);
if (wake_up)
wake_up(&mdev->al_wait);
}
......@@ -1118,7 +1119,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
* @mdev: DRBD device.
* @sector: The sector number.
*
* This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
* This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
*/
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
......@@ -1129,10 +1130,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
sig = wait_event_interruptible(mdev->al_wait,
(bm_ext = _bme_get(mdev, enr)));
if (sig)
return 0;
return -EINTR;
if (test_bit(BME_LOCKED, &bm_ext->flags))
return 1;
return 0;
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
sig = wait_event_interruptible(mdev->al_wait,
......@@ -1145,13 +1146,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
wake_up(&mdev->al_wait);
}
spin_unlock_irq(&mdev->al_lock);
return 0;
return -EINTR;
}
}
set_bit(BME_LOCKED, &bm_ext->flags);
return 1;
return 0;
}
/**
......
......@@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
*
* maybe bm_set should be atomic_t ?
*/
static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long s;
......
......@@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
* NOTE that the payload starts at a long aligned offset,
* regardless of 32 or 64 bit arch!
*/
struct p_header {
struct p_header80 {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
u8 payload[0];
} __packed;
/* 8 bytes. packet FIXED for the next century! */
/* Header for big packets, Used for data packets exceeding 64kB */
/*
 * Layout: 16-bit magic, 16-bit command, 32-bit length (2 + 2 + 4 bytes of
 * fixed fields), followed by the zero-length payload[] marker for the data
 * that comes after the header.  Declared __packed.
 */
struct p_header95 {
u16 magic; /* use DRBD_MAGIC_BIG here */
u16 command;
u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
u8 payload[0];
} __packed;
/*
 * Overlay of the two packet header layouts: h80 is a struct p_header80,
 * h95 is a struct p_header95.
 * NOTE(review): how a receiver decides which member is valid is not visible
 * here (presumably by the magic value) -- confirm against the receive path.
 */
union p_header {
struct p_header80 h80;
struct p_header95 h95;
};
/*
* short commands, packets without payload, plain p_header:
......@@ -362,12 +374,16 @@ struct p_header {
*/
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1
#define DP_RW_SYNC 2
#define DP_HARDBARRIER 1 /* deprecated */
#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
#define DP_UNPLUG 8 /* equals REQ_UNPLUG */
#define DP_FUA 16 /* equals REQ_FUA */
#define DP_FLUSH 32 /* equals REQ_FLUSH */
#define DP_DISCARD 64 /* equals REQ_DISCARD */
struct p_data {
struct p_header head;
union p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
......@@ -383,7 +399,7 @@ struct p_data {
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
struct p_header head;
struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
......@@ -392,7 +408,7 @@ struct p_block_ack {
struct p_block_req {
struct p_header head;
struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
......@@ -409,7 +425,7 @@ struct p_block_req {
*/
struct p_handshake {
struct p_header head; /* 8 bytes */
struct p_header80 head; /* 8 bytes */
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
......@@ -424,19 +440,19 @@ struct p_handshake {
/* 80 bytes, FIXED for the next century */
struct p_barrier {
struct p_header head;
struct p_header80 head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
struct p_header head;
struct p_header80 head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
struct p_header head;
struct p_header80 head;
u32 rate;
/* Since protocol version 88 and higher. */
......@@ -444,20 +460,31 @@ struct p_rs_param {
} __packed;
struct p_rs_param_89 {
struct p_header head;
struct p_header80 head;
u32 rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
/*
 * Resync-parameter packet carrying the same leading fields as
 * struct p_rs_param_89 (head, rate, verify_alg, csums_alg) followed by four
 * additional u32 c_* fields.  Declared __packed.
 * NOTE(review): the _95 suffix presumably refers to protocol version 95 --
 * confirm against the protocol negotiation code.
 */
struct p_rs_param_95 {
struct p_header80 head;
u32 rate;
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
u32 c_plan_ahead;
u32 c_delay_target;
u32 c_fill_target;
u32 c_max_rate;
} __packed;
/*
 * Connection flags.  The two values (1 and 2) occupy distinct bits.
 * NOTE(review): where these flags are set and consumed is not visible in
 * this hunk -- confirm their meaning against the users.
 */
enum drbd_conn_flags {
CF_WANT_LOSE = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
struct p_header head;
struct p_header80 head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
......@@ -471,17 +498,17 @@ struct p_protocol {
} __packed;
struct p_uuids {
struct p_header head;
struct p_header80 head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
struct p_header head;
struct p_header80 head;
u64 uuid;
} __packed;
struct p_sizes {
struct p_header head;
struct p_header80 head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
......@@ -491,18 +518,18 @@ struct p_sizes {
} __packed;
struct p_state {
struct p_header head;
struct p_header80 head;
u32 state;
} __packed;
struct p_req_state {
struct p_header head;
struct p_header80 head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
struct p_header head;
struct p_header80 head;
u32 retcode;
} __packed;
......@@ -517,7 +544,7 @@ struct p_drbd06_param {
} __packed;
struct p_discard {
struct p_header head;
struct p_header80 head;
u64 block_id;
u32 seq_num;
u32 pad;
......@@ -533,7 +560,7 @@ enum drbd_bitmap_code {
};
struct p_compressed_bm {
struct p_header head;
struct p_header80 head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
......@@ -544,8 +571,8 @@ struct p_compressed_bm {
u8 code[0];
} __packed;
struct p_delay_probe {
struct p_header head;
/*
 * Delay-probe packet: a struct p_header80 followed by two u32 fields.
 * Declared __packed.
 */
struct p_delay_probe93 {
struct p_header80 head;
u32 seq_num; /* sequence number to match the two probe packets */
u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
......@@ -594,7 +621,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
* so we need to use the fixed size 4KiB page size
* most architectures have used for a long time.
*/
#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
#if (PAGE_SIZE < 4096)
......@@ -603,13 +630,14 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
#endif
union p_polymorph {
struct p_header header;
union p_header header;
struct p_handshake handshake;
struct p_data data;
struct p_block_ack block_ack;
struct p_barrier barrier;
struct p_barrier_ack barrier_ack;
struct p_rs_param_89 rs_param_89;
struct p_rs_param_95 rs_param_95;
struct p_protocol protocol;
struct p_sizes sizes;
struct p_uuids uuids;
......@@ -617,6 +645,8 @@ union p_polymorph {
struct p_req_state req_state;
struct p_req_state_reply req_state_reply;
struct p_block_req block_req;
struct p_delay_probe93 delay_probe93;
struct p_rs_uuid rs_uuid;
} __packed;
/**********************************************************************/
......@@ -697,7 +727,7 @@ struct drbd_tl_epoch {
struct list_head requests; /* requests before */
struct drbd_tl_epoch *next; /* pointer to the next barrier */
unsigned int br_number; /* the barriers identifier. */
int n_req; /* number of requests attached before this barrier */
int n_writes; /* number of requests attached before this barrier */
};
struct drbd_request;
......@@ -747,7 +777,7 @@ struct digest_info {
struct drbd_epoch_entry {
struct drbd_work w;
struct hlist_node colision;
struct drbd_epoch *epoch;
struct drbd_epoch *epoch; /* for writes */
struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
......@@ -755,7 +785,10 @@ struct drbd_epoch_entry {
/* see comments on ee flag bits below */
unsigned long flags;
sector_t sector;
union {
u64 block_id;
struct digest_info *digest;
};
};
/* ee flag bits.
......@@ -781,12 +814,16 @@ enum {
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
/* global flag bits */
enum {
......@@ -794,7 +831,6 @@ enum {
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
SEND_PING, /* whether asender should send a ping asap */
STOP_SYNC_TIMER, /* tell timer to cancel itself */
UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
......@@ -816,6 +852,7 @@ enum {
BITMAP_IO, /* suspend application io;
once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */
GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
NET_CONGESTED, /* The data socket is congested */
......@@ -829,6 +866,8 @@ enum {
* the peer, if it changed there as well. */
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
NEW_CUR_UUID, /* Create new current UUID when thawing IO */
AL_SUSPENDED, /* Activity logging is currently suspended. */
};
struct drbd_bitmap; /* opaque for drbd_conf */
......@@ -838,10 +877,6 @@ struct drbd_bitmap; /* opaque for drbd_conf */
/* THINK maybe we actually want to use the default "event/%s" worker threads
* or similar in linux 2.6, which uses per cpu data and threads.
*
* To be general, this might need a spin_lock member.
* For now, please use the mdev->req_lock to protect list_head,
* see drbd_queue_work below.
*/
struct drbd_work_queue {
struct list_head q;
......@@ -915,6 +950,12 @@ enum write_ordering_e {
WO_bio_barrier
};
/*
 * Descriptor for a buffer of int values.
 * struct drbd_conf embeds one of these as rs_plan_s.
 * NOTE(review): the name suggests FIFO/ring usage, but the code that
 * fills and drains it is not visible here - confirm against its users.
 */
struct fifo_buffer {
int *values; /* element storage; allocation and ownership are not shown here */
unsigned int head_index; /* presumably the current head position in values - TODO confirm */
unsigned int size; /* presumably the number of elements in values - TODO confirm */
};
struct drbd_conf {
/* things that are stored as / read from meta data on disk */
unsigned long flags;
......@@ -936,9 +977,16 @@ struct drbd_conf {
unsigned int ko_count;
struct drbd_work resync_work,
unplug_work,
go_diskless,
md_sync_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
#ifdef DRBD_DEBUG_MD_SYNC
struct {
unsigned int line;
const char* func;
} last_md_mark_dirty;
#endif
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
......@@ -946,6 +994,7 @@ struct drbd_conf {
union drbd_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
......@@ -974,12 +1023,16 @@ struct drbd_conf {
unsigned long rs_start;
/* cumulated time in PausedSyncX state [unit jiffies] */
unsigned long rs_paused;
/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
unsigned long rs_same_csum;
#define DRBD_SYNC_MARKS 8
#define DRBD_SYNC_MARK_STEP (3*HZ)
/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
unsigned long rs_mark_left;
unsigned long rs_mark_left[DRBD_SYNC_MARKS];
/* marks's time [unit jiffies] */
unsigned long rs_mark_time;
/* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
unsigned long rs_same_csum;
unsigned long rs_mark_time[DRBD_SYNC_MARKS];
/* current index into rs_mark_{left,time} */
int rs_last_mark;
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
......@@ -1012,10 +1065,10 @@ struct drbd_conf {
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
struct list_head active_ee; /* IO in progress */
struct list_head sync_ee; /* IO in progress */
struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* send ack */
struct list_head read_ee; /* IO in progress */
struct list_head read_ee; /* IO in progress (any read) */
struct list_head net_ee; /* zero-copy network send in progress */
struct hlist_head *ee_hash; /* is proteced by req_lock! */
unsigned int ee_hash_s;
......@@ -1026,7 +1079,8 @@ struct drbd_conf {
int next_barrier_nr;
struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
atomic_t pp_in_use;
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
......@@ -1054,6 +1108,15 @@ struct drbd_conf {
u64 ed_uuid; /* UUID of the exposed data */
struct mutex state_mutex;
char congestion_reason; /* Why we where congested... */
atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
atomic_t rs_sect_ev; /* for submitted resync data rate, both */
int rs_last_sect_ev; /* counter to compare with */
int rs_last_events; /* counter of read or write "events" (unit sectors)
* on the lower level device when we last looked. */
int c_sync_rate; /* current resync rate after syncer throttle magic */
struct fifo_buffer rs_plan_s; /* correction values of resync planer */
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
int rs_planed; /* resync sectors already planed */
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
......@@ -1138,6 +1201,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev);
extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(struct drbd_conf *mdev);
enum drbd_req_event;
extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_conf *mdev);
extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
......@@ -1150,12 +1215,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f
extern int _drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
enum drbd_packets cmd, struct p_header *h,
enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags);
#define USE_DATA_SOCKET 1
#define USE_META_SOCKET 0
extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
enum drbd_packets cmd, struct p_header *h,
enum drbd_packets cmd, struct p_header80 *h,
size_t size);
extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
char *data, size_t size);
......@@ -1167,7 +1232,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_block_req *rp);
extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_data *dp);
struct p_data *dp, int data_size);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
sector_t sector, int blksize, u64 block_id);
extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
......@@ -1201,7 +1266,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
#ifndef DRBD_DEBUG_MD_SYNC
extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
#else
#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
unsigned int line, const char *func);
#endif
extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
void (*done)(struct drbd_conf *, int),
......@@ -1209,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
extern void drbd_go_diskless(struct drbd_conf *mdev);
/* Meta data layout
......@@ -1264,6 +1336,8 @@ struct bm_extent {
* Bit 1 ==> local node thinks this block needs to be synced.
*/
#define SLEEP_TIME (HZ/10)
#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
......@@ -1335,11 +1409,13 @@ struct bm_extent {
#endif
/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
* With a value of 6 all IO in one 32K block make it to the same slot of the
* With a value of 8 all IO in one 128K block make it to the same slot of the
* hash table. */
#define HT_SHIFT 6
#define HT_SHIFT 8
#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE 15
......@@ -1369,6 +1445,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_
/* bm_find_next variants for use while you hold drbd_bm_lock() */
extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
extern int drbd_bm_rs_done(struct drbd_conf *mdev);
/* for receive_bitmap */
......@@ -1421,7 +1498,8 @@ extern void resync_after_online_grow(struct drbd_conf *);
extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
int force);
enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
......@@ -1467,10 +1545,12 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
const unsigned rw, const int fault_type);
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
......@@ -1479,7 +1559,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
sector_t sector,
unsigned int data_size,
gfp_t gfp_mask) __must_hold(local);
extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
int is_net);
#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
......@@ -1487,6 +1570,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
extern void drbd_flush_workqueue(struct drbd_conf *mdev);
extern void drbd_free_tl_hash(struct drbd_conf *mdev);
/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
* mess with get_fs/set_fs, we know we are KERNEL_DS always. */
......@@ -1600,6 +1684,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
#define susp_nod_MASK 1
#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
......@@ -1855,13 +1941,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
}
}
static inline void
_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
{
list_add_tail(&w->list, &q->q);
up(&q->s);
}
static inline void
drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
{
......@@ -1899,19 +1978,19 @@ static inline void request_ping(struct drbd_conf *mdev)
static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
enum drbd_packets cmd)
{
struct p_header h;
struct p_header80 h;
return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
}
static inline int drbd_send_ping(struct drbd_conf *mdev)
{
struct p_header h;
struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
}
static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
{
struct p_header h;
struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
}
......@@ -2013,7 +2092,7 @@ static inline void inc_unacked(struct drbd_conf *mdev)
static inline void put_net_conf(struct drbd_conf *mdev)
{
if (atomic_dec_and_test(&mdev->net_cnt))
wake_up(&mdev->misc_wait);
wake_up(&mdev->net_cnt_wait);
}
/**
......@@ -2044,10 +2123,14 @@ static inline int get_net_conf(struct drbd_conf *mdev)
static inline void put_ldev(struct drbd_conf *mdev)
{
int i = atomic_dec_return(&mdev->local_cnt);
__release(local);
if (atomic_dec_and_test(&mdev->local_cnt))
D_ASSERT(i >= 0);
if (i == 0) {
if (mdev->state.disk == D_FAILED)
drbd_go_diskless(mdev);
wake_up(&mdev->misc_wait);
D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
}
}
#ifndef __CHECKER__
......@@ -2179,11 +2262,16 @@ static inline int drbd_state_is_stable(union drbd_state s)
return 1;
}
/*
 * is_susp() - test whether any suspend bit is set in a state word
 * @s: state to inspect (passed by value)
 *
 * Returns 1 if at least one of s.susp, s.susp_nod or s.susp_fen is
 * non-zero, 0 otherwise.
 */
static inline int is_susp(union drbd_state s)
{
	if (s.susp)
		return 1;
	if (s.susp_nod)
		return 1;
	if (s.susp_fen)
		return 1;
	return 0;
}
static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
if (mdev->state.susp)
if (is_susp(mdev->state))
return 0;
if (test_bit(SUSPEND_IO, &mdev->flags))
return 0;
......
......@@ -78,6 +78,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
......@@ -200,7 +201,7 @@ static int tl_init(struct drbd_conf *mdev)
INIT_LIST_HEAD(&b->w.list);
b->next = NULL;
b->br_number = 4711;
b->n_req = 0;
b->n_writes = 0;
b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
mdev->oldest_tle = b;
......@@ -241,7 +242,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
INIT_LIST_HEAD(&new->w.list);
new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
new->next = NULL;
new->n_req = 0;
new->n_writes = 0;
newest_before = mdev->newest_tle;
/* never send a barrier number == 0, because that is special-cased
......@@ -285,9 +286,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
barrier_nr, b->br_number);
goto bail;
}
if (b->n_req != set_size) {
dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
barrier_nr, set_size, b->n_req);
if (b->n_writes != set_size) {
dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
barrier_nr, set_size, b->n_writes);
goto bail;
}
......@@ -334,34 +335,51 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
/**
* tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
* _tl_restart() - Walks the transfer log, and applies an action to all requests
* @mdev: DRBD device.
* @what: The action/event to perform with all request objects
*
* This is called after the connection to the peer was lost. The storage covered
* by the requests on the transfer log gets marked as out of sync. Called from the
* receiver thread and the worker thread.
* @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
* restart_frozen_disk_io.
*/
void tl_clear(struct drbd_conf *mdev)
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
struct drbd_tl_epoch *b, *tmp;
struct list_head *le, *tle;
struct drbd_request *r;
int new_initial_bnr = net_random();
spin_lock_irq(&mdev->req_lock);
struct drbd_tl_epoch *b, *tmp, **pn;
struct list_head *le, *tle, carry_reads;
struct drbd_request *req;
int rv, n_writes, n_reads;
b = mdev->oldest_tle;
pn = &mdev->oldest_tle;
while (b) {
n_writes = 0;
n_reads = 0;
INIT_LIST_HEAD(&carry_reads);
list_for_each_safe(le, tle, &b->requests) {
r = list_entry(le, struct drbd_request, tl_requests);
/* It would be nice to complete outside of spinlock.
* But this is easier for now. */
_req_mod(r, connection_lost_while_pending);
req = list_entry(le, struct drbd_request, tl_requests);
rv = _req_mod(req, what);
n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
}
tmp = b->next;
if (n_writes) {
if (what == resend) {
b->n_writes = n_writes;
if (b->w.cb == NULL) {
b->w.cb = w_send_barrier;
inc_ap_pending(mdev);
set_bit(CREATE_BARRIER, &mdev->flags);
}
drbd_queue_work(&mdev->data.work, &b->w);
}
pn = &b->next;
} else {
if (n_reads)
list_add(&carry_reads, &b->requests);
/* there could still be requests on that ring list,
* in case local io is still pending */
list_del(&b->requests);
......@@ -376,17 +394,40 @@ void tl_clear(struct drbd_conf *mdev)
/* recycle, but reinit! */
D_ASSERT(tmp == NULL);
INIT_LIST_HEAD(&b->requests);
list_splice(&carry_reads, &b->requests);
INIT_LIST_HEAD(&b->w.list);
b->w.cb = NULL;
b->br_number = new_initial_bnr;
b->n_req = 0;
b->br_number = net_random();
b->n_writes = 0;
mdev->oldest_tle = b;
*pn = b;
break;
}
*pn = tmp;
kfree(b);
}
b = tmp;
list_splice(&carry_reads, &b->requests);
}
}
/**
* tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
* @mdev: DRBD device.
*
* This is called after the connection to the peer was lost. The storage covered
* by the requests on the transfer log gets marked as out of sync. Called from the
* receiver thread and the worker thread.
*/
void tl_clear(struct drbd_conf *mdev)
{
struct list_head *le, *tle;
struct drbd_request *r;
spin_lock_irq(&mdev->req_lock);
_tl_restart(mdev, connection_lost_while_pending);
/* we expect this list to be empty. */
D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
......@@ -402,6 +443,15 @@ void tl_clear(struct drbd_conf *mdev)
/* ensure bit indicating barrier is required is clear */
clear_bit(CREATE_BARRIER, &mdev->flags);
memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
spin_unlock_irq(&mdev->req_lock);
}
/**
 * tl_restart() - Run _tl_restart() with mdev->req_lock held
 * @mdev:	DRBD device.
 * @what:	The action/event, handed on to _tl_restart() unchanged.
 *
 * Takes mdev->req_lock with spin_lock_irq() around the call and drops it
 * afterwards. Code that already holds req_lock calls _tl_restart()
 * directly instead, as tl_clear() does.
 */
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
spin_lock_irq(&mdev->req_lock);
_tl_restart(mdev, what);
spin_unlock_irq(&mdev->req_lock);
}
......@@ -456,7 +506,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, int *warn_sync_abort);
union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
union drbd_state, union drbd_state);
......@@ -606,7 +656,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
drbd_role_str(ns.peer),
drbd_disk_str(ns.disk),
drbd_disk_str(ns.pdsk),
ns.susp ? 's' : 'r',
is_susp(ns) ? 's' : 'r',
ns.aftr_isp ? 'a' : '-',
ns.peer_isp ? 'p' : '-',
ns.user_isp ? 'u' : '-'
......@@ -764,7 +814,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
* to D_UNKNOWN. This rule and many more along those lines are in this function.
*/
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, int *warn_sync_abort)
union drbd_state ns, const char **warn_sync_abort)
{
enum drbd_fencing_p fp;
......@@ -779,9 +829,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
os.conn <= C_DISCONNECTING)
ns.conn = os.conn;
/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
* If you try to go into some Sync* state, that shall fail (elsewhere). */
if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
ns.conn = os.conn;
/* After C_DISCONNECTING only C_STANDALONE may follow */
......@@ -799,14 +850,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
ns.aftr_isp = 0;
if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
ns.pdsk = D_UNKNOWN;
/* Abort resync if a disk fails/detaches */
if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
(ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
if (warn_sync_abort)
*warn_sync_abort = 1;
*warn_sync_abort =
os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
"Online-verify" : "Resync";
ns.conn = C_CONNECTED;
}
......@@ -877,7 +927,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
if (fp == FP_STONITH &&
(ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
!(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
ns.susp = 1;
ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
(ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
if (ns.conn == C_SYNC_SOURCE)
......@@ -913,6 +968,12 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
}
}
/*
 * drbd_resume_al() - clear the AL_SUSPENDED flag and report it
 * @mdev: DRBD device.
 *
 * Clears AL_SUSPENDED in mdev->flags. The info message is logged only
 * when the bit was actually set before; otherwise this does nothing.
 */
static void drbd_resume_al(struct drbd_conf *mdev)
{
	/* Bit was not set: nothing was suspended, so stay silent. */
	if (!test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		return;

	dev_info(DEV, "Resumed AL updates\n");
}
/**
* __drbd_set_state() - Set a new DRBD state
* @mdev: DRBD device.
......@@ -928,7 +989,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
{
union drbd_state os;
int rv = SS_SUCCESS;
int warn_sync_abort = 0;
const char *warn_sync_abort = NULL;
struct after_state_chg_work *ascw;
os = mdev->state;
......@@ -947,14 +1008,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
/* If the old state was illegal as well, then let
this happen...*/
if (is_valid_state(mdev, os) == rv) {
dev_err(DEV, "Considering state change from bad state. "
"Error would be: '%s'\n",
drbd_set_st_err_str(rv));
print_st(mdev, "old", os);
print_st(mdev, "new", ns);
if (is_valid_state(mdev, os) == rv)
rv = is_valid_state_transition(mdev, ns, os);
}
} else
rv = is_valid_state_transition(mdev, ns, os);
}
......@@ -966,7 +1021,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
}
if (warn_sync_abort)
dev_warn(DEV, "Resync aborted.\n");
dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
{
char *pbp, pb[300];
......@@ -977,7 +1032,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
PSC(conn);
PSC(disk);
PSC(pdsk);
PSC(susp);
if (is_susp(ns) != is_susp(os))
pbp += sprintf(pbp, "susp( %s -> %s ) ",
drbd_susp_str(is_susp(os)),
drbd_susp_str(is_susp(ns)));
PSC(aftr_isp);
PSC(peer_isp);
PSC(user_isp);
......@@ -1002,12 +1060,6 @@ int __drbd_set_state(struct drbd_conf *mdev,
wake_up(&mdev->misc_wait);
wake_up(&mdev->state_wait);
/* post-state-change actions */
if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
set_bit(STOP_SYNC_TIMER, &mdev->flags);
mod_timer(&mdev->resync_timer, jiffies);
}
/* aborted verify run. log the last position */
if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
ns.conn < C_CONNECTED) {
......@@ -1020,41 +1072,42 @@ int __drbd_set_state(struct drbd_conf *mdev,
if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
(ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
dev_info(DEV, "Syncer continues.\n");
mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
if (ns.conn == C_SYNC_TARGET) {
if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
mdev->rs_paused += (long)jiffies
-(long)mdev->rs_mark_time[mdev->rs_last_mark];
if (ns.conn == C_SYNC_TARGET)
mod_timer(&mdev->resync_timer, jiffies);
/* This if (!test_bit) is only needed for the case
that a device that has ceased to used its timer,
i.e. it is already in drbd_resync_finished() gets
paused and resumed. */
}
}
if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
(ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
dev_info(DEV, "Resync suspended\n");
mdev->rs_mark_time = jiffies;
if (ns.conn == C_PAUSED_SYNC_T)
set_bit(STOP_SYNC_TIMER, &mdev->flags);
mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
}
if (os.conn == C_CONNECTED &&
(ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
unsigned long now = jiffies;
int i;
mdev->ov_position = 0;
mdev->rs_total =
mdev->rs_mark_left = drbd_bm_bits(mdev);
mdev->rs_total = drbd_bm_bits(mdev);
if (mdev->agreed_pro_version >= 90)
set_ov_position(mdev, ns.conn);
else
mdev->ov_start_sector = 0;
mdev->ov_left = mdev->rs_total
- BM_SECT_TO_BIT(mdev->ov_position);
mdev->rs_start =
mdev->rs_mark_time = jiffies;
mdev->rs_start = now;
mdev->rs_last_events = 0;
mdev->rs_last_sect_ev = 0;
mdev->ov_last_oos_size = 0;
mdev->ov_last_oos_start = 0;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
mdev->rs_mark_left[i] = mdev->rs_total;
mdev->rs_mark_time[i] = now;
}
if (ns.conn == C_VERIFY_S) {
dev_info(DEV, "Starting Online Verify from sector %llu\n",
(unsigned long long)mdev->ov_position);
......@@ -1107,6 +1160,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
drbd_thread_restart_nowait(&mdev->receiver);
/* Resume AL writing if we get a connection */
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
drbd_resume_al(mdev);
ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
if (ascw) {
ascw->os = os;
......@@ -1165,6 +1222,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, enum chg_state_flags flags)
{
enum drbd_fencing_p fp;
enum drbd_req_event what = nothing;
union drbd_state nsm = (union drbd_state){ .i = -1 };
if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
clear_bit(CRASHED_PRIMARY, &mdev->flags);
......@@ -1188,17 +1247,49 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* Here we have the actions that are performed after a
state change. This function might sleep */
if (fp == FP_STONITH && ns.susp) {
/* case1: The outdate peer handler is successful:
* case2: The connection was established again: */
if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
(os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
nsm.i = -1;
if (ns.susp_nod) {
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
if (ns.conn == C_CONNECTED)
what = resend, nsm.susp_nod = 0;
else /* ns.conn > C_CONNECTED */
dev_err(DEV, "Unexpected Resynd going on!\n");
}
if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
what = restart_frozen_disk_io, nsm.susp_nod = 0;
}
if (ns.susp_fen) {
/* case1: The outdate peer handler is successful: */
if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
tl_clear(mdev);
if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
drbd_uuid_new_current(mdev);
clear_bit(NEW_CUR_UUID, &mdev->flags);
drbd_md_sync(mdev);
}
spin_lock_irq(&mdev->req_lock);
_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
}
/* case2: The connection was established again: */
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
clear_bit(NEW_CUR_UUID, &mdev->flags);
what = resend;
nsm.susp_fen = 0;
}
}
if (what != nothing) {
spin_lock_irq(&mdev->req_lock);
_tl_restart(mdev, what);
nsm.i &= mdev->state.i;
_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
}
/* Do not change the order of the if above and the two below... */
if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
drbd_send_uuids(mdev);
......@@ -1217,16 +1308,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
if (get_ldev(mdev)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
if (is_susp(mdev->state)) {
set_bit(NEW_CUR_UUID, &mdev->flags);
} else {
drbd_uuid_new_current(mdev);
drbd_send_uuids(mdev);
}
}
put_ldev(mdev);
}
}
if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
drbd_uuid_new_current(mdev);
drbd_send_uuids(mdev);
}
/* D_DISKLESS Peer becomes secondary */
if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
......@@ -1268,42 +1365,51 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
/* first half of local IO error */
if (os.disk > D_FAILED && ns.disk == D_FAILED) {
enum drbd_io_error_p eh;
enum drbd_io_error_p eh = EP_PASS_ON;
if (drbd_send_state(mdev))
dev_warn(DEV, "Notified peer that my disk is broken.\n");
else
dev_err(DEV, "Sending state for drbd_io_error() failed\n");
drbd_rs_cancel_all(mdev);
eh = EP_PASS_ON;
if (get_ldev_if_state(mdev, D_FAILED)) {
eh = mdev->ldev->dc.on_io_error;
put_ldev(mdev);
}
if (eh == EP_CALL_HELPER)
drbd_khelper(mdev, "local-io-error");
}
drbd_rs_cancel_all(mdev);
/* since get_ldev() only works as long as disk>=D_INCONSISTENT,
and it is D_DISKLESS here, local_cnt can only go down, it can
not increase... It will reach zero */
wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
/* second half of local IO error handling,
* after local_cnt references have reached zero: */
if (os.disk == D_FAILED && ns.disk == D_DISKLESS) {
mdev->rs_total = 0;
mdev->rs_failed = 0;
atomic_set(&mdev->rs_pending_cnt, 0);
spin_lock_irq(&mdev->req_lock);
_drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
spin_unlock_irq(&mdev->req_lock);
if (eh == EP_CALL_HELPER)
drbd_khelper(mdev, "local-io-error");
}
if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
/* We must still be diskless,
* re-attach has to be serialized with this! */
if (mdev->state.disk != D_DISKLESS)
dev_err(DEV,
"ASSERT FAILED: disk is %s while going diskless\n",
drbd_disk_str(mdev->state.disk));
/* we cannot assert local_cnt == 0 here, as get_ldev_if_state
* will inc/dec it frequently. Since we became D_DISKLESS, no
* one has touched the protected members anymore, though, so we
* are safe to free them here. */
if (drbd_send_state(mdev))
dev_warn(DEV, "Notified peer that my disk is broken.\n");
dev_warn(DEV, "Notified peer that I detached my disk.\n");
else
dev_err(DEV, "Sending state in drbd_io_error() failed\n");
}
dev_err(DEV, "Sending state for detach failed\n");
wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
lc_destroy(mdev->resync);
mdev->resync = NULL;
lc_destroy(mdev->act_log);
......@@ -1312,8 +1418,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
drbd_free_bc(mdev->ldev);
mdev->ldev = NULL;);
if (mdev->md_io_tmpp)
if (mdev->md_io_tmpp) {
__free_page(mdev->md_io_tmpp);
mdev->md_io_tmpp = NULL;
}
}
/* Disks got bigger while they were detached */
......@@ -1329,6 +1437,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
(os.user_isp && !ns.user_isp))
resume_next_sg(mdev);
/* sync target done with resync. Explicitly notify peer, even though
* it should (at least for non-empty resyncs) already know itself. */
if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
drbd_send_state(mdev);
/* free tl_hash if we Got thawed and are C_STANDALONE */
if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
drbd_free_tl_hash(mdev);
/* Upon network connection, we need to start the receiver */
if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
drbd_thread_start(&mdev->receiver);
......@@ -1555,7 +1672,7 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
/* the appropriate socket mutex must be held already */
int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
enum drbd_packets cmd, struct p_header *h,
enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags)
{
int sent, ok;
......@@ -1565,7 +1682,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
h->magic = BE_DRBD_MAGIC;
h->command = cpu_to_be16(cmd);
h->length = cpu_to_be16(size-sizeof(struct p_header));
h->length = cpu_to_be16(size-sizeof(struct p_header80));
sent = drbd_send(mdev, sock, h, size, msg_flags);
......@@ -1580,7 +1697,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
* when we hold the appropriate socket mutex.
*/
int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
enum drbd_packets cmd, struct p_header *h, size_t size)
enum drbd_packets cmd, struct p_header80 *h, size_t size)
{
int ok = 0;
struct socket *sock;
......@@ -1608,7 +1725,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
size_t size)
{
struct p_header h;
struct p_header80 h;
int ok;
h.magic = BE_DRBD_MAGIC;
......@@ -1630,7 +1747,7 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
{
struct p_rs_param_89 *p;
struct p_rs_param_95 *p;
struct socket *sock;
int size, rv;
const int apv = mdev->agreed_pro_version;
......@@ -1638,7 +1755,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
size = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
+ strlen(mdev->sync_conf.verify_alg) + 1
: /* 89 */ sizeof(struct p_rs_param_89);
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
/* used from admin command context and receiver/worker context.
* to avoid kmalloc, grab the socket right here,
......@@ -1649,12 +1767,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
if (likely(sock != NULL)) {
enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
p = &mdev->data.sbuf.rs_param_89;
p = &mdev->data.sbuf.rs_param_95;
/* initialize verify_alg and csums_alg */
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
p->rate = cpu_to_be32(sc->rate);
p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
p->c_delay_target = cpu_to_be32(sc->c_delay_target);
p->c_fill_target = cpu_to_be32(sc->c_fill_target);
p->c_max_rate = cpu_to_be32(sc->c_max_rate);
if (apv >= 88)
strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
......@@ -1710,7 +1832,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
(struct p_header *)p, size);
(struct p_header80 *)p, size);
kfree(p);
return rv;
}
......@@ -1736,7 +1858,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
put_ldev(mdev);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
}
int drbd_send_uuids(struct drbd_conf *mdev)
......@@ -1757,7 +1879,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
p.uuid = cpu_to_be64(val);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
}
int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
......@@ -1787,7 +1909,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
p.dds_flags = cpu_to_be16(flags);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
return ok;
}
......@@ -1812,7 +1934,7 @@ int drbd_send_state(struct drbd_conf *mdev)
if (likely(sock != NULL)) {
ok = _drbd_send_cmd(mdev, sock, P_STATE,
(struct p_header *)&p, sizeof(p), 0);
(struct p_header80 *)&p, sizeof(p), 0);
}
mutex_unlock(&mdev->data.mutex);
......@@ -1830,7 +1952,7 @@ int drbd_send_state_req(struct drbd_conf *mdev,
p.val = cpu_to_be32(val.i);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
}
int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
......@@ -1840,7 +1962,7 @@ int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
p.retcode = cpu_to_be32(retcode);
return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
}
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
......@@ -1939,7 +2061,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
enum { OK, FAILED, DONE }
send_bitmap_rle_or_plain(struct drbd_conf *mdev,
struct p_header *h, struct bm_xfer_ctx *c)
struct p_header80 *h, struct bm_xfer_ctx *c)
{
struct p_compressed_bm *p = (void*)h;
unsigned long num_words;
......@@ -1969,12 +2091,12 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
if (len)
drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
h, sizeof(struct p_header) + len, 0);
h, sizeof(struct p_header80) + len, 0);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
c->packets[1]++;
c->bytes[1] += sizeof(struct p_header) + len;
c->bytes[1] += sizeof(struct p_header80) + len;
if (c->bit_offset > c->bm_bits)
c->bit_offset = c->bm_bits;
......@@ -1990,14 +2112,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
int _drbd_send_bitmap(struct drbd_conf *mdev)
{
struct bm_xfer_ctx c;
struct p_header *p;
struct p_header80 *p;
int ret;
ERR_IF(!mdev->bitmap) return FALSE;
/* maybe we should use some per thread scratch page,
* and allocate that during initial device creation? */
p = (struct p_header *) __get_free_page(GFP_NOIO);
p = (struct p_header80 *) __get_free_page(GFP_NOIO);
if (!p) {
dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
return FALSE;
......@@ -2055,7 +2177,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
if (mdev->state.conn < C_CONNECTED)
return FALSE;
ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
return ok;
}
......@@ -2083,17 +2205,18 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
return FALSE;
ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
return ok;
}
/* dp->sector and dp->block_id already/still in network byte order,
* data_size is payload size according to dp->head,
* and may need to be corrected for digest size. */
int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_data *dp)
struct p_data *dp, int data_size)
{
const int header_size = sizeof(struct p_data)
- sizeof(struct p_header);
int data_size = ((struct p_header *)dp)->length - header_size;
data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
dp->block_id);
}
......@@ -2141,7 +2264,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
p.blksize = cpu_to_be32(size);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
return ok;
}
......@@ -2159,7 +2282,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev,
p.head.magic = BE_DRBD_MAGIC;
p.head.command = cpu_to_be16(cmd);
p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
mutex_lock(&mdev->data.mutex);
......@@ -2181,7 +2304,7 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
p.blksize = cpu_to_be32(size);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
(struct p_header *)&p, sizeof(p));
(struct p_header80 *)&p, sizeof(p));
return ok;
}
......@@ -2333,6 +2456,18 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
return 1;
}
/*
 * Translate the bio request flags in bi_rw into DRBD DP_* wire flags.
 *
 * From agreed protocol version 95 on, REQ_SYNC, REQ_UNPLUG, REQ_FUA,
 * REQ_FLUSH and REQ_DISCARD each map to their own DP_* bit.  For older
 * protocol versions only DP_RW_SYNC is produced, and it is set when
 * either REQ_SYNC or REQ_UNPLUG is present.
 */
static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
{
	u32 wire_flags = 0;

	if (mdev->agreed_pro_version < 95) {
		/* old peers: SYNC and UNPLUG share one flag */
		if (bi_rw & (REQ_SYNC | REQ_UNPLUG))
			wire_flags = DP_RW_SYNC;
		return wire_flags;
	}

	if (bi_rw & REQ_SYNC)
		wire_flags |= DP_RW_SYNC;
	if (bi_rw & REQ_UNPLUG)
		wire_flags |= DP_UNPLUG;
	if (bi_rw & REQ_FUA)
		wire_flags |= DP_FUA;
	if (bi_rw & REQ_FLUSH)
		wire_flags |= DP_FLUSH;
	if (bi_rw & REQ_DISCARD)
		wire_flags |= DP_DISCARD;

	return wire_flags;
}
/* Used to send write requests
* R_PRIMARY -> Peer (P_DATA)
*/
......@@ -2350,30 +2485,25 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
p.head.magic = BE_DRBD_MAGIC;
p.head.command = cpu_to_be16(P_DATA);
p.head.length =
cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
p.head.h80.magic = BE_DRBD_MAGIC;
p.head.h80.command = cpu_to_be16(P_DATA);
p.head.h80.length =
cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
} else {
p.head.h95.magic = BE_DRBD_MAGIC_BIG;
p.head.h95.command = cpu_to_be16(P_DATA);
p.head.h95.length =
cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
}
p.sector = cpu_to_be64(req->sector);
p.block_id = (unsigned long)req;
p.seq_num = cpu_to_be32(req->seq_num =
atomic_add_return(1, &mdev->packet_seq));
dp_flags = 0;
/* NOTE: no need to check if barriers supported here as we would
* not pass the test in make_request_common in that case
*/
if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
/* dp_flags |= DP_HARDBARRIER; */
}
if (req->master_bio->bi_rw & REQ_SYNC)
dp_flags |= DP_RW_SYNC;
/* for now handle SYNCIO and UNPLUG
* as if they still were one and the same flag */
if (req->master_bio->bi_rw & REQ_UNPLUG)
dp_flags |= DP_RW_SYNC;
dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
if (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T)
dp_flags |= DP_MAY_SET_IN_SYNC;
......@@ -2414,10 +2544,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
p.head.magic = BE_DRBD_MAGIC;
p.head.command = cpu_to_be16(cmd);
p.head.length =
cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
p.head.h80.magic = BE_DRBD_MAGIC;
p.head.h80.command = cpu_to_be16(cmd);
p.head.h80.length =
cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
} else {
p.head.h95.magic = BE_DRBD_MAGIC_BIG;
p.head.h95.command = cpu_to_be16(cmd);
p.head.h95.length =
cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
}
p.sector = cpu_to_be64(e->sector);
p.block_id = e->block_id;
......@@ -2430,8 +2567,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
if (!drbd_get_data_sock(mdev))
return 0;
ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
sizeof(p), dgs ? MSG_MORE : 0);
ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
if (ok && dgs) {
dgb = mdev->int_dig_out;
drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
......@@ -2606,7 +2742,13 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
/* .verify_alg = */ {}, 0,
/* .cpu_mask = */ {}, 0,
/* .csums_alg = */ {}, 0,
/* .use_rle = */ 0
/* .use_rle = */ 0,
/* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
/* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
/* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
/* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
/* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
/* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
};
/* Have to use that way, because the layout differs between
......@@ -2617,7 +2759,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
.conn = C_STANDALONE,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN,
.susp = 0
.susp = 0,
.susp_nod = 0,
.susp_fen = 0
} };
}
......@@ -2641,6 +2785,9 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
atomic_set(&mdev->net_cnt, 0);
atomic_set(&mdev->packet_seq, 0);
atomic_set(&mdev->pp_in_use, 0);
atomic_set(&mdev->pp_in_use_by_net, 0);
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
mutex_init(&mdev->md_io_mutex);
mutex_init(&mdev->data.mutex);
......@@ -2667,11 +2814,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
INIT_LIST_HEAD(&mdev->meta.work.q);
INIT_LIST_HEAD(&mdev->resync_work.list);
INIT_LIST_HEAD(&mdev->unplug_work.list);
INIT_LIST_HEAD(&mdev->go_diskless.list);
INIT_LIST_HEAD(&mdev->md_sync_work.list);
INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
mdev->resync_work.cb = w_resync_inactive;
mdev->unplug_work.cb = w_send_write_hint;
mdev->go_diskless.cb = w_go_diskless;
mdev->md_sync_work.cb = w_md_sync;
mdev->bm_io_work.w.cb = w_bitmap_io;
init_timer(&mdev->resync_timer);
......@@ -2683,6 +2832,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
init_waitqueue_head(&mdev->misc_wait);
init_waitqueue_head(&mdev->state_wait);
init_waitqueue_head(&mdev->net_cnt_wait);
init_waitqueue_head(&mdev->ee_wait);
init_waitqueue_head(&mdev->al_wait);
init_waitqueue_head(&mdev->seq_wait);
......@@ -2698,6 +2848,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
void drbd_mdev_cleanup(struct drbd_conf *mdev)
{
int i;
if (mdev->receiver.t_state != None)
dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
mdev->receiver.t_state);
......@@ -2714,9 +2865,13 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
mdev->p_size =
mdev->rs_start =
mdev->rs_total =
mdev->rs_failed =
mdev->rs_mark_left =
mdev->rs_mark_time = 0;
mdev->rs_failed = 0;
mdev->rs_last_events = 0;
mdev->rs_last_sect_ev = 0;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
mdev->rs_mark_left[i] = 0;
mdev->rs_mark_time[i] = 0;
}
D_ASSERT(mdev->net_conf == NULL);
drbd_set_my_capacity(mdev, 0);
......@@ -2727,6 +2882,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
}
drbd_free_resources(mdev);
clear_bit(AL_SUSPENDED, &mdev->flags);
/*
* currently we drbd_init_ee only on module load, so
......@@ -2742,6 +2898,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
D_ASSERT(list_empty(&mdev->meta.work.q));
D_ASSERT(list_empty(&mdev->resync_work.list));
D_ASSERT(list_empty(&mdev->unplug_work.list));
D_ASSERT(list_empty(&mdev->go_diskless.list));
}
......@@ -3281,9 +3438,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
sector_t sector;
int i;
del_timer(&mdev->md_sync_timer);
/* timer may be rearmed by drbd_md_mark_dirty() now. */
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
return;
del_timer(&mdev->md_sync_timer);
/* We use here D_FAILED and not D_ATTACHING because we try to write
* metadata even if we detach due to a disk failure! */
......@@ -3311,12 +3469,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset;
if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
clear_bit(MD_DIRTY, &mdev->flags);
} else {
if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
/* this was a try anyways ... */
dev_err(DEV, "meta data update failed!\n");
drbd_chk_io_error(mdev, 1, TRUE);
}
......@@ -3403,6 +3558,28 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
return rv;
}
/*
 * Emit a dynamic-debug message with the current value of one UUID slot of
 * the local backing device's meta data.
 *
 * @mdev:  DRBD device; mdev->ldev is dereferenced, so the caller must hold
 *         a reference on the local disk.
 * @index: UUID slot to report.  Out-of-range values only produce a
 *         warning and are otherwise ignored.
 */
static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
{
	/* read-only name table; the strings are literals and never modified */
	static const char * const uuid_str[UI_EXTENDED_SIZE] = {
		[UI_CURRENT] = "CURRENT",
		[UI_BITMAP] = "BITMAP",
		[UI_HISTORY_START] = "HISTORY_START",
		[UI_HISTORY_END] = "HISTORY_END",
		[UI_SIZE] = "SIZE",
		[UI_FLAGS] = "FLAGS",
	};

	/*
	 * Callers hand in plain ints; the unsigned comparison also rejects
	 * negative values should the enum type be signed.
	 */
	if ((unsigned int)index >= UI_EXTENDED_SIZE) {
		dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
		return;
	}

	dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
			uuid_str[index],
			(unsigned long long)mdev->ldev->md.uuid[index]);
}
/**
* drbd_md_mark_dirty() - Mark meta data super block as dirty
* @mdev: DRBD device.
......@@ -3411,19 +3588,31 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
* the meta-data super block. This function sets MD_DIRTY, and starts a
* timer that ensures that within five seconds you have to call drbd_md_sync().
*/
#ifdef DEBUG
/*
 * Debug variant: on the 0 -> 1 transition of MD_DIRTY, arm md_sync_timer
 * (one second from now) and remember which caller (line/func) dirtied the
 * meta data, so w_md_sync() can report it when the timer expires.
 */
void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
{
	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
		mdev->last_md_mark_dirty.line = line;
		mdev->last_md_mark_dirty.func = func;
	}
}
#else
/*
 * Set MD_DIRTY and, only if it was not set before, arm md_sync_timer to
 * fire five seconds from now.
 *
 * The flag must be set by test_and_set_bit() alone: setting it
 * unconditionally first would make the test always see the bit as already
 * set, and the timer would never be armed.
 */
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
}
#endif
/*
 * Copy each history UUID slot i into slot i+1, for i running from
 * UI_HISTORY_START up to (but excluding) UI_HISTORY_END, and emit a debug
 * message for every slot written.
 *
 * NOTE(review): the copy walks forward, so with more than two history
 * slots the value of uuid[UI_HISTORY_START] would end up in every later
 * slot.  Presumably the history range is exactly two entries - confirm
 * before extending it.
 */
static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
{
	int i;

	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
		debug_drbd_uuid(mdev, i+1);
	}
}
void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
......@@ -3438,6 +3627,7 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
}
mdev->ldev->md.uuid[idx] = val;
debug_drbd_uuid(mdev, idx);
drbd_md_mark_dirty(mdev);
}
......@@ -3447,6 +3637,7 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
if (mdev->ldev->md.uuid[idx]) {
drbd_uuid_move_history(mdev);
mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
debug_drbd_uuid(mdev, UI_HISTORY_START);
}
_drbd_uuid_set(mdev, idx, val);
}
......@@ -3465,6 +3656,7 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
dev_info(DEV, "Creating new current UUID\n");
D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
debug_drbd_uuid(mdev, UI_BITMAP);
get_random_bytes(&val, sizeof(u64));
_drbd_uuid_set(mdev, UI_CURRENT, val);
......@@ -3479,6 +3671,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
drbd_uuid_move_history(mdev);
mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
mdev->ldev->md.uuid[UI_BITMAP] = 0;
debug_drbd_uuid(mdev, UI_HISTORY_START);
debug_drbd_uuid(mdev, UI_BITMAP);
} else {
if (mdev->ldev->md.uuid[UI_BITMAP])
dev_warn(DEV, "bm UUID already set");
......@@ -3486,6 +3680,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
mdev->ldev->md.uuid[UI_BITMAP] = val;
mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
debug_drbd_uuid(mdev, UI_BITMAP);
}
drbd_md_mark_dirty(mdev);
}
......@@ -3528,6 +3723,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
{
int rv = -EIO;
drbd_resume_al(mdev);
if (get_ldev_if_state(mdev, D_ATTACHING)) {
drbd_bm_clear_all(mdev);
rv = drbd_bm_write(mdev);
......@@ -3560,6 +3756,32 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
return 1;
}
/*
 * Work callback (installed as mdev->go_diskless.cb) that moves a device
 * from D_FAILED to D_DISKLESS.
 *
 * It forces the disk state to D_DISKLESS, then blocks on misc_wait until
 * local_cnt has dropped to zero, and finally clears GO_DISKLESS so that
 * drbd_go_diskless() may queue this work again.  Neither w nor the third
 * argument is used.  Always returns 1.
 */
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
D_ASSERT(mdev->state.disk == D_FAILED);
/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
 * the protected members anymore, though, so in the after_state_ch work
 * it will be safe to free them. */
drbd_force_state(mdev, NS(disk, D_DISKLESS));
/* We need to wait for return of references checked out while we still
 * have been D_FAILED, though (drbd_md_sync, bitmap io). */
wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
/* only now allow the next drbd_go_diskless() to queue us again */
clear_bit(GO_DISKLESS, &mdev->flags);
return 1;
}
/*
 * Request the D_FAILED -> D_DISKLESS transition by queueing the
 * go_diskless work item on the data work queue.
 *
 * GO_DISKLESS guards the queueing: if the flag is already set, nothing is
 * queued.  The device is expected to be in D_FAILED when this is called.
 */
void drbd_go_diskless(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->state.disk == D_FAILED);

	if (test_and_set_bit(GO_DISKLESS, &mdev->flags))
		return;

	/* don't drbd_queue_work_front,
	 * we need to serialize with the after_state_ch work
	 * of the -> D_FAILED transition. */
	drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
}
/**
* drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
* @mdev: DRBD device.
......@@ -3656,8 +3878,11 @@ static void md_sync_timer_fn(unsigned long data)
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
drbd_md_sync(mdev);
return 1;
}
......
......@@ -33,10 +33,13 @@
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
#include "drbd_req.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
#include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
#include <linux/compiler.h>
#include <linux/kthread.h>
static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
......@@ -169,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
put_net_conf(mdev);
}
/* The helper may take some time.
* write out any unsynced meta data changes now */
drbd_md_sync(mdev);
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
drbd_bcast_ev_helper(mdev, cmd);
......@@ -202,12 +209,10 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
put_ldev(mdev);
} else {
dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
return mdev->state.pdsk;
nps = mdev->state.pdsk;
goto out;
}
if (fp == FP_STONITH)
_drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
r = drbd_khelper(mdev, "fence-peer");
switch ((r>>8) & 0xff) {
......@@ -252,9 +257,36 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
dev_info(DEV, "fence-peer helper returned %d (%s)\n",
(r>>8) & 0xff, ex_to_string);
out:
if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
/* The handler was not successful... unfreeze here, the
state engine can not unfreeze... */
_drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
}
return nps;
}
static int _try_outdate_peer_async(void *data)
{
struct drbd_conf *mdev = (struct drbd_conf *)data;
enum drbd_disk_state nps;
nps = drbd_try_outdate_peer(mdev);
drbd_request_state(mdev, NS(pdsk, nps));
return 0;
}
/*
 * Run the peer-outdating logic in its own kernel thread, named
 * "drbd<minor>_a_helper", instead of in the caller's context.
 * A failure to start the thread is only logged.
 */
void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
{
	struct task_struct *helper;

	helper = kthread_run(_try_outdate_peer_async, mdev,
			     "drbd%d_a_helper", mdev_to_minor(mdev));
	if (!IS_ERR(helper))
		return;

	dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
}
int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
{
......@@ -394,6 +426,39 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
return r;
}
/*
 * Look up the device for @minor, optionally creating it.
 *
 * @minor:  minor number; values >= minor_count yield NULL.
 * @create: when non-zero and no device is registered for @minor yet, a
 *          new device is allocated and, if nobody else registered one in
 *          the meantime, entered into minor_table and its gendisk added.
 *
 * Returns whatever minor_to_mdev() reports for @minor after the optional
 * creation attempt, i.e. NULL if there is (still) no such device.
 */
static struct drbd_conf *ensure_mdev(int minor, int create)
{
	struct drbd_conf *mdev;
	struct gendisk *disk = NULL;

	if (minor >= minor_count)
		return NULL;

	mdev = minor_to_mdev(minor);
	if (mdev || !create)
		return mdev;

	mdev = drbd_new_device(minor);
	/*
	 * NOTE(review): assumes drbd_new_device() signals failure by
	 * returning NULL - confirm.  Without this check the NULL pointer
	 * would be dereferenced below (mdev->vdisk) or handed to
	 * drbd_free_mdev().  Someone else may still have registered the
	 * minor concurrently, so report the table's view.
	 */
	if (!mdev)
		return minor_to_mdev(minor);

	spin_lock_irq(&drbd_pp_lock);
	if (minor_table[minor] == NULL) {
		minor_table[minor] = mdev;
		disk = mdev->vdisk;
		mdev = NULL;
	} /* else: we lost the race */
	spin_unlock_irq(&drbd_pp_lock);

	if (disk) /* we won the race above */
		/* in case we ever add a drbd_delete_device(),
		 * don't forget the del_gendisk! */
		add_disk(disk);
	else /* we lost the race above */
		drbd_free_mdev(mdev);

	return minor_to_mdev(minor);
}
static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
......@@ -494,6 +559,8 @@ char *ppsize(char *buf, unsigned long long size)
/*
 * Set SUSPEND_IO and, unless the device state is already suspended
 * (is_susp()), wait on misc_wait until ap_bio_cnt has dropped to zero.
 */
void drbd_suspend_io(struct drbd_conf *mdev)
{
	set_bit(SUSPEND_IO, &mdev->flags);

	if (!is_susp(mdev->state))
		wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
}
......@@ -713,9 +780,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
blk_queue_segment_boundary(q, PAGE_SIZE-1);
blk_stack_limits(&q->limits, &b->limits, 0);
if (b->merge_bvec_fn)
dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
b->merge_bvec_fn);
dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
......@@ -729,14 +793,16 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
/* serialize deconfig (worker exiting, doing cleanup)
 * and reconfig (drbdsetup disk, drbdsetup net)
 *
 * Wait for a potentially exiting worker, then restart it,
 * or start a new one. Flush any pending work, there may still be an
 * after_state_change queued.
 */
static void drbd_reconfig_start(struct drbd_conf *mdev)
{
/* sleep until this caller is the one that sets CONFIG_PENDING */
wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
/* then wait for DEVICE_DYING to be cleared before touching the worker */
wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
drbd_thread_start(&mdev->worker);
drbd_flush_workqueue(mdev);
}
/* if still unconfigured, stops worker again.
......@@ -756,6 +822,29 @@ static void drbd_reconfig_done(struct drbd_conf *mdev)
wake_up(&mdev->state_wait);
}
/*
 * Shrink the activity log and, while not connected, mark AL updates as
 * suspended.
 *
 * Make sure IO is suspended before calling this function().
 *
 * If the activity log cannot be locked, a warning is logged and nothing
 * else happens.  AL_SUSPENDED is only set while the connection state is
 * below C_CONNECTED (checked under req_lock); the info message is printed
 * only by the call that actually flipped the flag.
 */
static void drbd_suspend_al(struct drbd_conf *mdev)
{
	int newly_suspended = 0;

	if (!lc_try_lock(mdev->act_log)) {
		dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
		return;
	}
	drbd_al_shrink(mdev);
	lc_unlock(mdev->act_log);

	spin_lock_irq(&mdev->req_lock);
	if (mdev->state.conn < C_CONNECTED)
		newly_suspended = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
	spin_unlock_irq(&mdev->req_lock);

	if (newly_suspended)
		dev_info(DEV, "Suspended AL updates\n");
}
/* does always return 0;
* interesting return code is in reply->ret_code */
static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
......@@ -769,6 +858,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
struct inode *inode, *inode2;
struct lru_cache *resync_lru = NULL;
union drbd_state ns, os;
unsigned int max_seg_s;
int rv;
int cp_discovered = 0;
int logical_block_size;
......@@ -803,6 +893,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
goto fail;
}
if (get_net_conf(mdev)) {
int prot = mdev->net_conf->wire_protocol;
put_net_conf(mdev);
if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
retcode = ERR_STONITH_AND_PROT_A;
goto fail;
}
}
nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
if (IS_ERR(nbc->lo_file)) {
dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
......@@ -924,7 +1023,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_suspend_io(mdev);
/* also wait for the last barrier ack. */
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
/* and for any other previously queued work */
drbd_flush_workqueue(mdev);
......@@ -1021,7 +1120,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
else
clear_bit(CRASHED_PRIMARY, &mdev->flags);
if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
!(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
set_bit(CRASHED_PRIMARY, &mdev->flags);
cp_discovered = 1;
}
......@@ -1031,7 +1131,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
mdev->read_cnt = 0;
mdev->writ_cnt = 0;
drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
max_seg_s = DRBD_MAX_SEGMENT_SIZE;
if (mdev->state.conn == C_CONNECTED) {
/* We are Primary, Connected, and now attach a new local
* backing store. We must not increase the user visible maximum
* bio size on this device to something the peer may not be
* able to handle. */
if (mdev->agreed_pro_version < 94)
max_seg_s = queue_max_segment_size(mdev->rq_queue);
else if (mdev->agreed_pro_version == 94)
max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
/* else: drbd 8.3.9 and later, stay with default */
}
drbd_setup_queue_param(mdev, max_seg_s);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,
......@@ -1079,6 +1192,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_al_to_on_disk_bm(mdev);
}
if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
drbd_suspend_al(mdev); /* IO is still suspended here... */
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
ns.i = os.i;
......@@ -1235,7 +1351,16 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
&& (new_conf->wire_protocol != DRBD_PROT_C)) {
retcode = ERR_NOT_PROTO_C;
goto fail;
};
}
if (get_ldev(mdev)) {
enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
put_ldev(mdev);
if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
retcode = ERR_STONITH_AND_PROT_A;
goto fail;
}
}
if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
retcode = ERR_DISCARD;
......@@ -1350,6 +1475,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
}
}
drbd_flush_workqueue(mdev);
spin_lock_irq(&mdev->req_lock);
if (mdev->net_conf != NULL) {
retcode = ERR_NET_CONFIGURED;
......@@ -1388,10 +1514,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
mdev->int_dig_out=int_dig_out;
mdev->int_dig_in=int_dig_in;
mdev->int_dig_vv=int_dig_vv;
retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
reply->ret_code = retcode;
drbd_reconfig_done(mdev);
......@@ -1546,6 +1671,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
struct crypto_hash *csums_tfm = NULL;
struct syncer_conf sc;
cpumask_var_t new_cpu_mask;
int *rs_plan_s = NULL;
int fifo_size;
if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
retcode = ERR_NOMEM;
......@@ -1557,6 +1684,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
sc.rate = DRBD_RATE_DEF;
sc.after = DRBD_AFTER_DEF;
sc.al_extents = DRBD_AL_EXTENTS_DEF;
sc.on_no_data = DRBD_ON_NO_DATA_DEF;
sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
} else
memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
......@@ -1634,6 +1767,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
}
#undef AL_MAX
/* to avoid spurious errors when configuring minors before configuring
* the minors they depend on: if necessary, first create the minor we
* depend on */
if (sc.after >= 0)
ensure_mdev(sc.after, 1);
/* most sanity checks done, try to assign the new sync-after
* dependency. need to hold the global lock in there,
* to avoid a race in the dependency loop check. */
......@@ -1641,6 +1780,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
if (retcode != NO_ERROR)
goto fail;
fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
if (!rs_plan_s) {
dev_err(DEV, "kmalloc of fifo_buffer failed");
retcode = ERR_NOMEM;
goto fail;
}
}
/* ok, assign the rest of it as well.
* lock against receive_SyncParam() */
spin_lock(&mdev->peer_seq_lock);
......@@ -1657,6 +1806,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
mdev->verify_tfm = verify_tfm;
verify_tfm = NULL;
}
if (fifo_size != mdev->rs_plan_s.size) {
kfree(mdev->rs_plan_s.values);
mdev->rs_plan_s.values = rs_plan_s;
mdev->rs_plan_s.size = fifo_size;
mdev->rs_planed = 0;
rs_plan_s = NULL;
}
spin_unlock(&mdev->peer_seq_lock);
if (get_ldev(mdev)) {
......@@ -1688,6 +1846,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
kfree(rs_plan_s);
free_cpumask_var(new_cpu_mask);
crypto_free_hash(csums_tfm);
crypto_free_hash(verify_tfm);
......@@ -1721,12 +1880,38 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
return 0;
}
/* Bitmap I/O callback (handed to drbd_bitmap_io() by drbd_nl_invalidate_peer):
 * runs drbd_bmio_set_n_write() and then calls drbd_suspend_al().
 * The result of the set_n_write step is returned unchanged; drbd_suspend_al()
 * is called regardless of that result. */
static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
{
	const int bmio_result = drbd_bmio_set_n_write(mdev);

	drbd_suspend_al(mdev);
	return bmio_result;
}
/*
 * Netlink handler for "invalidate peer".
 *
 * Requests the connection state C_STARTING_SYNC_S (with CS_ORDERED).
 * If that request fails:
 *  - with SS_NEED_CONNECTION while this node is R_PRIMARY, the peer disk is
 *    set to D_INCONSISTENT instead and, if that succeeds, the bitmap is
 *    updated through drbd_bitmap_io() using drbd_bmio_set_susp_al;
 *    a bitmap I/O failure is reported as ERR_IO_MD_DISK.
 *  - in every other case the C_STARTING_SYNC_S request is retried through
 *    plain drbd_request_state().
 *
 * The state request is issued exactly once up front; the final outcome is
 * stored in reply->ret_code.  The function itself always returns 0.
 */
static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
				   struct drbd_nl_cfg_reply *reply)
{
	int retcode;

	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);

	if (retcode < SS_SUCCESS) {
		if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
			/* The peer will get a resync upon connect anyways. Just make that
			   into a full resync. */
			retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
			if (retcode >= SS_SUCCESS) {
				/* set all bits, write out, and suspend via the
				 * drbd_bmio_set_susp_al callback */
				if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
						   "set_n_write from invalidate_peer"))
					retcode = ERR_IO_MD_DISK;
			}
		} else {
			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
		}
	}

	reply->ret_code = retcode;
	return 0;
}
......@@ -1765,7 +1950,21 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
/*
 * Netlink handler for "resume-io".
 *
 * If the NEW_CUR_UUID flag is set, a new current UUID is generated, the flag
 * is cleared and the meta data is synced first.
 *
 * The state change that clears susp, susp_nod and susp_fen is requested
 * between drbd_suspend_io() and drbd_resume_io(); no state request is made
 * outside that bracket.  When the request returns SS_SUCCESS:
 *  - tl_clear() is called if the connection state is below C_CONNECTED,
 *  - tl_restart(mdev, fail_frozen_disk_io) is called if the local disk is
 *    D_DISKLESS or D_FAILED.
 *
 * The result of the state request is stored in reply->ret_code; the function
 * itself always returns 0.
 */
static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
			     struct drbd_nl_cfg_reply *reply)
{
	if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
		drbd_uuid_new_current(mdev);
		clear_bit(NEW_CUR_UUID, &mdev->flags);
		drbd_md_sync(mdev);
	}

	drbd_suspend_io(mdev);
	reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
	if (reply->ret_code == SS_SUCCESS) {
		if (mdev->state.conn < C_CONNECTED)
			tl_clear(mdev);
		if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
			tl_restart(mdev, fail_frozen_disk_io);
	}
	drbd_resume_io(mdev);

	return 0;
}
......@@ -1941,40 +2140,6 @@ static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
return 0;
}
/*
 * Look up the device for nlp->drbd_minor, optionally creating it.
 *
 * Returns NULL if the minor is >= minor_count, if the device does not exist
 * and DRBD_NL_CREATE_DEVICE is not set in nlp->flags, or if creating the
 * device failed.  Otherwise returns the device registered in the minor table.
 *
 * Creation may race with another creator: the new device is only published
 * in minor_table[] under drbd_pp_lock if the slot is still empty; the loser
 * frees its own instance and returns the winner's.
 */
static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
{
	struct drbd_conf *mdev;

	if (nlp->drbd_minor >= minor_count)
		return NULL;

	mdev = minor_to_mdev(nlp->drbd_minor);

	if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
		struct gendisk *disk = NULL;

		mdev = drbd_new_device(nlp->drbd_minor);
		/* NOTE(review): assumes drbd_new_device() reports failure by
		 * returning NULL -- confirm.  Without this guard a failed
		 * creation would be dereferenced (mdev->vdisk) below. */
		if (!mdev)
			return NULL;

		spin_lock_irq(&drbd_pp_lock);
		if (minor_table[nlp->drbd_minor] == NULL) {
			minor_table[nlp->drbd_minor] = mdev;
			disk = mdev->vdisk;
			mdev = NULL;
		} /* else: we lost the race */
		spin_unlock_irq(&drbd_pp_lock);

		if (disk) /* we won the race above */
			/* in case we ever add a drbd_delete_device(),
			 * don't forget the del_gendisk! */
			add_disk(disk);
		else /* we lost the race above */
			drbd_free_mdev(mdev);

		/* re-read: either our own device or the one that beat us */
		mdev = minor_to_mdev(nlp->drbd_minor);
	}

	return mdev;
}
struct cn_handler_struct {
int (*function)(struct drbd_conf *,
struct drbd_nl_cfg_req *,
......@@ -2035,7 +2200,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
goto fail;
}
mdev = ensure_mdev(nlp);
mdev = ensure_mdev(nlp->drbd_minor,
(nlp->flags & DRBD_NL_CREATE_DEVICE));
if (!mdev) {
retcode = ERR_MINOR_INVALID;
goto fail;
......
......@@ -57,6 +57,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
unsigned long db, dt, dbdt, rt, rs_left;
unsigned int res;
int i, x, y;
int stalled = 0;
drbd_get_syncer_progress(mdev, &rs_left, &res);
......@@ -90,18 +91,17 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
* db: blocks written from mark until now
* rt: remaining time
*/
dt = (jiffies - mdev->rs_mark_time) / HZ;
if (dt > 20) {
/* if we made no update to rs_mark_time for too long,
* we are stalled. show that. */
seq_printf(seq, "stalled\n");
return;
}
/* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
* at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
* least DRBD_SYNC_MARK_STEP time before it will be modified. */
i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
stalled = 1;
if (!dt)
dt++;
db = mdev->rs_mark_left - rs_left;
db = mdev->rs_mark_left[i] - rs_left;
rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
seq_printf(seq, "finish: %lu:%02lu:%02lu",
......@@ -118,7 +118,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
/* mean speed since syncer started
* we do account for PausedSync periods */
dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
if (dt <= 0)
if (dt == 0)
dt = 1;
db = mdev->rs_total - rs_left;
dbdt = Bit2KB(db/dt);
......@@ -128,7 +128,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
else
seq_printf(seq, " (%ld)", dbdt);
seq_printf(seq, " K/sec\n");
if (mdev->state.conn == C_SYNC_TARGET) {
if (mdev->c_sync_rate > 1000)
seq_printf(seq, " want: %d,%03d",
mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
else
seq_printf(seq, " want: %d", mdev->c_sync_rate);
}
seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
}
static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
......@@ -196,7 +203,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
i, sn,
......@@ -206,11 +213,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
mdev->state.susp ? 's' : 'r',
is_susp(mdev->state) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
......
......@@ -241,7 +241,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
drbd_free_ee(mdev, e);
drbd_free_net_ee(mdev, e);
}
/**
......@@ -298,9 +298,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
* Is also used from inside an other spin_lock_irq(&mdev->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
{
atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
i = page_chain_free(page);
else {
......@@ -311,10 +313,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
drbd_pp_vacant += i;
spin_unlock(&drbd_pp_lock);
}
atomic_sub(i, &mdev->pp_in_use);
i = atomic_read(&mdev->pp_in_use);
i = atomic_sub_return(i, a);
if (i < 0)
dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
is_net ? "pp_in_use_by_net" : "pp_in_use", i);
wake_up(&drbd_pp_wait);
}
......@@ -365,7 +367,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
e->size = data_size;
e->flags = 0;
e->sector = sector;
e->sector = sector;
e->block_id = id;
return e;
......@@ -375,9 +376,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
return NULL;
}
void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
{
drbd_pp_free(mdev, e->pages);
if (e->flags & EE_HAS_DIGEST)
kfree(e->digest);
drbd_pp_free(mdev, e->pages, is_net);
D_ASSERT(atomic_read(&e->pending_bios) == 0);
D_ASSERT(hlist_unhashed(&e->colision));
mempool_free(e, drbd_ee_mempool);
......@@ -388,13 +391,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
LIST_HEAD(work_list);
struct drbd_epoch_entry *e, *t;
int count = 0;
int is_net = list == &mdev->net_ee;
spin_lock_irq(&mdev->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &work_list, w.list) {
drbd_free_ee(mdev, e);
drbd_free_some_ee(mdev, e, is_net);
count++;
}
return count;
......@@ -423,7 +427,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
drbd_free_ee(mdev, e);
drbd_free_net_ee(mdev, e);
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_discard_ack.
......@@ -719,14 +723,14 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
/*
 * Send a header-only packet with command @cmd on @sock.
 *
 * Uses the h80 header inside mdev->data.sbuf as the packet buffer and passes
 * it, with its own size, to _drbd_send_cmd(); that call's return value is
 * returned unchanged.
 */
static int drbd_send_fp(struct drbd_conf *mdev,
			struct socket *sock, enum drbd_packets cmd)
{
	struct p_header80 *h = &mdev->data.sbuf.header.h80;

	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
}
static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
{
struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
struct p_header80 *h = &mdev->data.rbuf.header.h80;
int rr;
rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
......@@ -776,9 +780,6 @@ static int drbd_connect(struct drbd_conf *mdev)
D_ASSERT(!mdev->data.socket);
if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
return -2;
......@@ -927,6 +928,11 @@ static int drbd_connect(struct drbd_conf *mdev)
drbd_thread_start(&mdev->asender);
if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
put_ldev(mdev);
}
if (!drbd_send_protocol(mdev))
return -1;
drbd_send_sync_param(mdev, &mdev->sync_conf);
......@@ -946,22 +952,28 @@ static int drbd_connect(struct drbd_conf *mdev)
return -1;
}
static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
{
union p_header *h = &mdev->data.rbuf.header;
int r;
r = drbd_recv(mdev, h, sizeof(*h));
if (unlikely(r != sizeof(*h))) {
dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
return FALSE;
};
h->command = be16_to_cpu(h->command);
h->length = be16_to_cpu(h->length);
if (unlikely(h->magic != BE_DRBD_MAGIC)) {
dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
(long)be32_to_cpu(h->magic),
h->command, h->length);
}
if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
*cmd = be16_to_cpu(h->h80.command);
*packet_size = be16_to_cpu(h->h80.length);
} else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
*cmd = be16_to_cpu(h->h95.command);
*packet_size = be32_to_cpu(h->h95.length);
} else {
dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
be32_to_cpu(h->h80.magic),
be16_to_cpu(h->h80.command),
be16_to_cpu(h->h80.length));
return FALSE;
}
mdev->last_received = jiffies;
......@@ -1268,17 +1280,12 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
return 1;
}
static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
int rv, issue_flush;
struct p_barrier *p = (struct p_barrier *)h;
struct p_barrier *p = &mdev->data.rbuf.barrier;
struct drbd_epoch *epoch;
ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
rv = drbd_recv(mdev, h->payload, h->length);
ERR_IF(rv != h->length) return FALSE;
inc_unacked(mdev);
if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
......@@ -1457,7 +1464,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
data_size -= rr;
}
kunmap(page);
drbd_pp_free(mdev, page);
drbd_pp_free(mdev, page, 0);
return rv;
}
......@@ -1562,30 +1569,29 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
list_add(&e->w.list, &mdev->sync_ee);
spin_unlock_irq(&mdev->req_lock);
atomic_add(data_size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
return TRUE;
/* drbd_submit_ee currently fails for one reason only:
* not being able to allocate enough bios.
* Is dropping the connection going to help? */
spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list);
spin_unlock_irq(&mdev->req_lock);
drbd_free_ee(mdev, e);
fail:
put_ldev(mdev);
return FALSE;
}
static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct drbd_request *req;
sector_t sector;
unsigned int header_size, data_size;
int ok;
struct p_data *p = (struct p_data *)h;
header_size = sizeof(*p) - sizeof(*h);
data_size = h->length - header_size;
ERR_IF(data_size == 0) return FALSE;
if (drbd_recv(mdev, h->payload, header_size) != header_size)
return FALSE;
struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
......@@ -1611,20 +1617,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
return ok;
}
static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
unsigned int header_size, data_size;
int ok;
struct p_data *p = (struct p_data *)h;
header_size = sizeof(*p) - sizeof(*h);
data_size = h->length - header_size;
ERR_IF(data_size == 0) return FALSE;
if (drbd_recv(mdev, h->payload, header_size) != header_size)
return FALSE;
struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
D_ASSERT(p->block_id == ID_SYNCER);
......@@ -1640,9 +1637,11 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
ok = drbd_drain_block(mdev, data_size);
drbd_send_ack_dp(mdev, P_NEG_ACK, p);
drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
}
atomic_add(data_size >> 9, &mdev->rs_sect_in);
return ok;
}
......@@ -1765,24 +1764,27 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
return ret;
}
/*
 * Translate the data-packet flags @dpf received from the peer into bio
 * request flags.
 *
 * For an agreed protocol version below 95 only DP_RW_SYNC is looked at, and
 * it maps to REQ_SYNC | REQ_UNPLUG.  From version 95 on each DP_* flag is
 * mapped individually.
 *
 * NOTE(review): DP_FLUSH is mapped to REQ_FUA, same as DP_FUA -- confirm
 * that this is intended.
 */
static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
{
	unsigned long bio_flags = 0;

	if (mdev->agreed_pro_version < 95) {
		if (dpf & DP_RW_SYNC)
			bio_flags = REQ_SYNC | REQ_UNPLUG;
		return bio_flags;
	}

	if (dpf & DP_RW_SYNC)
		bio_flags |= REQ_SYNC;
	if (dpf & DP_UNPLUG)
		bio_flags |= REQ_UNPLUG;
	if (dpf & DP_FUA)
		bio_flags |= REQ_FUA;
	if (dpf & DP_FLUSH)
		bio_flags |= REQ_FUA;
	if (dpf & DP_DISCARD)
		bio_flags |= REQ_DISCARD;

	return bio_flags;
}
/* mirrored write */
static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
struct drbd_epoch_entry *e;
struct p_data *p = (struct p_data *)h;
int header_size, data_size;
struct p_data *p = &mdev->data.rbuf.data;
int rw = WRITE;
u32 dp_flags;
header_size = sizeof(*p) - sizeof(*h);
data_size = h->length - header_size;
ERR_IF(data_size == 0) return FALSE;
if (drbd_recv(mdev, h->payload, header_size) != header_size)
return FALSE;
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not write mirrored data block "
......@@ -1792,7 +1794,7 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
mdev->peer_seq++;
spin_unlock(&mdev->peer_seq_lock);
drbd_send_ack_dp(mdev, P_NEG_ACK, p);
drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
atomic_inc(&mdev->current_epoch->epoch_size);
return drbd_drain_block(mdev, data_size);
}
......@@ -1839,12 +1841,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
spin_unlock(&mdev->epoch_lock);
dp_flags = be32_to_cpu(p->dp_flags);
if (dp_flags & DP_HARDBARRIER) {
dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
/* rw |= REQ_HARDBARRIER; */
}
if (dp_flags & DP_RW_SYNC)
rw |= REQ_SYNC | REQ_UNPLUG;
rw |= write_flags_to_bio(mdev, dp_flags);
if (dp_flags & DP_MAY_SET_IN_SYNC)
e->flags |= EE_MAY_SET_IN_SYNC;
......@@ -2007,6 +2005,16 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
return TRUE;
/* drbd_submit_ee currently fails for one reason only:
* not being able to allocate enough bios.
* Is dropping the connection going to help? */
spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list);
hlist_del_init(&e->colision);
spin_unlock_irq(&mdev->req_lock);
if (e->flags & EE_CALL_AL_COMPLETE_IO)
drbd_al_complete_io(mdev, e->sector);
out_interrupted:
/* yes, the epoch_size now is imbalanced.
* but we drop the connection anyways, so we don't have a chance to
......@@ -2016,20 +2024,64 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
return FALSE;
}
static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
/* Decide whether resync should be throttled right now.
 *
 * Returns 1 if the backing device shows activity we did not cause ourselves
 * and the recent resync rate is above sync_conf.c_min_rate; returns 0
 * otherwise, including when c_min_rate is 0 (feature disabled).
 *
 * "Busy" is detected with a scheme similar to MD RAID is_mddev_idle(): the
 * sector counters of the backing disk's part0, minus the sectors accounted
 * in rs_sect_ev for our own resync, are compared against the value remembered
 * in rs_last_events.  A difference of more than 64 sectors (or no remembered
 * value yet) counts as unexplained activity.
 *
 * The rate is computed against the sync mark two steps behind rs_last_mark,
 * i.e. a short time average, so that we can react quickly.
 *
 * NOTE(review): mdev->ldev is dereferenced unconditionally; presumably the
 * caller holds a reference on the local disk -- confirm.
 */
int drbd_rs_should_slow_down(struct drbd_conf *mdev)
{
	struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
	unsigned long rs_left, elapsed, synced, rate;
	int events;
	int mark;

	/* feature disabled? */
	if (mdev->sync_conf.c_min_rate == 0)
		return 0;

	events = (int)part_stat_read(&disk->part0, sectors[0]) +
		 (int)part_stat_read(&disk->part0, sectors[1]) -
		 atomic_read(&mdev->rs_sect_ev);

	/* no significant activity beyond what we already accounted for */
	if (mdev->rs_last_events && events - mdev->rs_last_events <= 64)
		return 0;

	mdev->rs_last_events = events;

	/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
	 * approx. */
	mark = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
	rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;

	elapsed = ((long)jiffies - (long)mdev->rs_mark_time[mark]) / HZ;
	if (elapsed == 0)
		elapsed = 1;	/* avoid division by zero below */

	synced = mdev->rs_mark_left[mark] - rs_left;
	rate = Bit2KB(synced/elapsed);

	return rate > mdev->sync_conf.c_min_rate;
}
static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
{
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
struct drbd_epoch_entry *e;
struct digest_info *di = NULL;
int size, digest_size;
int size, verb;
unsigned int fault_type;
struct p_block_req *p =
(struct p_block_req *)h;
const int brps = sizeof(*p)-sizeof(*h);
if (drbd_recv(mdev, h->payload, brps) != brps)
return FALSE;
struct p_block_req *p = &mdev->data.rbuf.block_req;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
......@@ -2046,12 +2098,31 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
}
if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
if (__ratelimit(&drbd_ratelimit_state))
verb = 1;
switch (cmd) {
case P_DATA_REQUEST:
drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
break;
case P_RS_DATA_REQUEST:
case P_CSUM_RS_REQUEST:
case P_OV_REQUEST:
drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
break;
case P_OV_REPLY:
verb = 0;
dec_rs_pending(mdev);
drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
break;
default:
dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
cmdname(cmd));
}
if (verb && __ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not satisfy peer's read request, "
"no local data.\n");
drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
P_NEG_RS_DREPLY , p);
return drbd_drain_block(mdev, h->length - brps);
/* drain possible payload */
return drbd_drain_block(mdev, digest_size);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
......@@ -2063,31 +2134,21 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
return FALSE;
}
switch (h->command) {
switch (cmd) {
case P_DATA_REQUEST:
e->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
break;
/* application IO, don't drbd_rs_begin_io */
goto submit;
case P_RS_DATA_REQUEST:
e->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
/* Eventually this should become asynchronously. Currently it
* blocks the whole receiver just to delay the reading of a
* resync data block.
* the drbd_work_queue mechanism is made for this...
*/
if (!drbd_rs_begin_io(mdev, sector)) {
/* we have been interrupted,
* probably connection lost! */
D_ASSERT(signal_pending(current));
goto out_free_e;
}
break;
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
digest_size = h->length - brps ;
di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
if (!di)
goto out_free_e;
......@@ -2095,31 +2156,25 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
di->digest_size = digest_size;
di->digest = (((char *)di)+sizeof(struct digest_info));
e->digest = di;
e->flags |= EE_HAS_DIGEST;
if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
goto out_free_e;
e->block_id = (u64)(unsigned long)di;
if (h->command == P_CSUM_RS_REQUEST) {
if (cmd == P_CSUM_RS_REQUEST) {
D_ASSERT(mdev->agreed_pro_version >= 89);
e->w.cb = w_e_end_csum_rs_req;
} else if (h->command == P_OV_REPLY) {
} else if (cmd == P_OV_REPLY) {
e->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
break;
}
if (!drbd_rs_begin_io(mdev, sector)) {
/* we have been interrupted, probably connection lost! */
D_ASSERT(signal_pending(current));
goto out_free_e;
/* drbd_rs_begin_io done when we sent this request,
* but accounting still needs to be done. */
goto submit_for_resync;
}
break;
case P_OV_REQUEST:
if (mdev->state.conn >= C_CONNECTED &&
mdev->state.conn != C_VERIFY_T)
dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
drbd_conn_str(mdev->state.conn));
if (mdev->ov_start_sector == ~(sector_t)0 &&
mdev->agreed_pro_version >= 90) {
mdev->ov_start_sector = sector;
......@@ -2130,37 +2185,63 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
}
e->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
/* Eventually this should become asynchronous. Currently it
* blocks the whole receiver just to delay the reading of a
* resync data block.
* the drbd_work_queue mechanism is made for this...
*/
if (!drbd_rs_begin_io(mdev, sector)) {
/* we have been interrupted,
* probably connection lost! */
D_ASSERT(signal_pending(current));
goto out_free_e;
}
break;
default:
dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
cmdname(h->command));
cmdname(cmd));
fault_type = DRBD_FAULT_MAX;
goto out_free_e;
}
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
/* Throttle, drbd_rs_begin_io and submit should become asynchronous
* wrt the receiver, but it is not as straightforward as it may seem.
* Various places in the resync start and stop logic assume resync
* requests are processed in order, requeuing this on the worker thread
* introduces a bunch of new code for synchronization between threads.
*
* Unlimited throttling before drbd_rs_begin_io may stall the resync
* "forever", throttling after drbd_rs_begin_io will lock that extent
* for application writes for the same time. For now, just throttle
* here, where the rest of the code expects the receiver to sleep for
* a while, anyways.
*/
/* Throttle before drbd_rs_begin_io, as that locks out application IO;
* this defers syncer requests for some time, before letting at least
* one request through. The resync controller on the receiving side
* will adapt to the incoming rate accordingly.
*
* We cannot throttle here if remote is Primary/SyncTarget:
* we would also throttle its application reads.
* In that case, throttling is done on the SyncTarget only.
*/
if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
msleep(100);
if (drbd_rs_begin_io(mdev, e->sector))
goto out_free_e;
submit_for_resync:
atomic_add(size >> 9, &mdev->rs_sect_ev);
submit:
inc_unacked(mdev);
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
return TRUE;
/* drbd_submit_ee currently fails for one reason only:
* not being able to allocate enough bios.
* Is dropping the connection going to help? */
spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list);
spin_unlock_irq(&mdev->req_lock);
/* no drbd_rs_complete_io(), we are dropping the connection anyways */
out_free_e:
kfree(di);
put_ldev(mdev);
drbd_free_ee(mdev, e);
return FALSE;
......@@ -2699,20 +2780,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
return 1;
}
static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct p_protocol *p = (struct p_protocol *)h;
int header_size, data_size;
struct p_protocol *p = &mdev->data.rbuf.protocol;
int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
int p_want_lose, p_two_primaries, cf;
char p_integrity_alg[SHARED_SECRET_MAX] = "";
header_size = sizeof(*p) - sizeof(*h);
data_size = h->length - header_size;
if (drbd_recv(mdev, h->payload, header_size) != header_size)
return FALSE;
p_proto = be32_to_cpu(p->protocol);
p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
......@@ -2805,39 +2879,46 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
return tfm;
}
static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
{
int ok = TRUE;
struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
unsigned int header_size, data_size, exp_max_sz;
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
const int apv = mdev->agreed_pro_version;
int *rs_plan_s = NULL;
int fifo_size = 0;
exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
+ SHARED_SECRET_MAX
: /* 89 */ sizeof(struct p_rs_param_89);
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
if (h->length > exp_max_sz) {
if (packet_size > exp_max_sz) {
dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
h->length, exp_max_sz);
packet_size, exp_max_sz);
return FALSE;
}
if (apv <= 88) {
header_size = sizeof(struct p_rs_param) - sizeof(*h);
data_size = h->length - header_size;
} else /* apv >= 89 */ {
header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
data_size = h->length - header_size;
header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
data_size = packet_size - header_size;
} else if (apv <= 94) {
header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
data_size = packet_size - header_size;
D_ASSERT(data_size == 0);
} else {
header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
data_size = packet_size - header_size;
D_ASSERT(data_size == 0);
}
/* initialize verify_alg and csums_alg */
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
if (drbd_recv(mdev, h->payload, header_size) != header_size)
if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
return FALSE;
mdev->sync_conf.rate = be32_to_cpu(p->rate);
......@@ -2896,6 +2977,22 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
}
}
if (apv > 94) {
mdev->sync_conf.rate = be32_to_cpu(p->rate);
mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
if (!rs_plan_s) {
dev_err(DEV, "kmalloc of fifo_buffer failed");
goto disconnect;
}
}
}
spin_lock(&mdev->peer_seq_lock);
/* lock against drbd_nl_syncer_conf() */
......@@ -2913,6 +3010,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
mdev->csums_tfm = csums_tfm;
dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
}
if (fifo_size != mdev->rs_plan_s.size) {
kfree(mdev->rs_plan_s.values);
mdev->rs_plan_s.values = rs_plan_s;
mdev->rs_plan_s.size = fifo_size;
mdev->rs_planed = 0;
}
spin_unlock(&mdev->peer_seq_lock);
}
......@@ -2946,19 +3049,15 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
(unsigned long long)a, (unsigned long long)b);
}
static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct p_sizes *p = (struct p_sizes *)h;
struct p_sizes *p = &mdev->data.rbuf.sizes;
enum determine_dev_size dd = unchanged;
unsigned int max_seg_s;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
if (drbd_recv(mdev, h->payload, h->length) != h->length)
return FALSE;
p_size = be64_to_cpu(p->d_size);
p_usize = be64_to_cpu(p->u_size);
......@@ -3028,6 +3127,8 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
if (mdev->agreed_pro_version < 94)
max_seg_s = be32_to_cpu(p->max_segment_size);
else if (mdev->agreed_pro_version == 94)
max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
else /* drbd 8.3.8 onwards */
max_seg_s = DRBD_MAX_SEGMENT_SIZE;
......@@ -3061,16 +3162,12 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct p_uuids *p = (struct p_uuids *)h;
struct p_uuids *p = &mdev->data.rbuf.uuids;
u64 *p_uuid;
int i;
ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
if (drbd_recv(mdev, h->payload, h->length) != h->length)
return FALSE;
p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
......@@ -3106,6 +3203,11 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
drbd_md_sync(mdev);
}
put_ldev(mdev);
} else if (mdev->state.disk < D_INCONSISTENT &&
mdev->state.role == R_PRIMARY) {
/* I am a diskless primary, the peer just created a new current UUID
for me. */
drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
}
/* Before we test for the disk state, we should wait until an eventually
......@@ -3149,16 +3251,12 @@ static union drbd_state convert_state(union drbd_state ps)
return ms;
}
static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct p_req_state *p = (struct p_req_state *)h;
struct p_req_state *p = &mdev->data.rbuf.req_state;
union drbd_state mask, val;
int rv;
ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
if (drbd_recv(mdev, h->payload, h->length) != h->length)
return FALSE;
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
......@@ -3179,20 +3277,14 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
static int receive_state(struct drbd_conf *mdev, struct p_header *h)
static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct p_state *p = (struct p_state *)h;
enum drbd_conns nconn, oconn;
union drbd_state ns, peer_state;
struct p_state *p = &mdev->data.rbuf.state;
union drbd_state os, ns, peer_state;
enum drbd_disk_state real_peer_disk;
enum chg_state_flags cs_flags;
int rv;
ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
return FALSE;
if (drbd_recv(mdev, h->payload, h->length) != h->length)
return FALSE;
peer_state.i = be32_to_cpu(p->state);
real_peer_disk = peer_state.disk;
......@@ -3203,38 +3295,72 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
spin_lock_irq(&mdev->req_lock);
retry:
oconn = nconn = mdev->state.conn;
os = ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
if (nconn == C_WF_REPORT_PARAMS)
nconn = C_CONNECTED;
/* peer says his disk is uptodate, while we think it is inconsistent,
* and this happens while we think we have a sync going on. */
if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
/* If we are (becoming) SyncSource, but peer is still in sync
* preparation, ignore its uptodate-ness to avoid flapping, it
* will change to inconsistent once the peer reaches active
* syncing states.
* It may have changed syncer-paused flags, however, so we
* cannot ignore this completely. */
if (peer_state.conn > C_CONNECTED &&
peer_state.conn < C_SYNC_SOURCE)
real_peer_disk = D_INCONSISTENT;
/* if peer_state changes to connected at the same time,
* it explicitly notifies us that it finished resync.
* Maybe we should finish it up, too? */
else if (os.conn >= C_SYNC_SOURCE &&
peer_state.conn == C_CONNECTED) {
if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
drbd_resync_finished(mdev);
return TRUE;
}
}
/* peer says his disk is inconsistent, while we think it is uptodate,
* and this happens while the peer still thinks we have a sync going on,
* but we think we are already done with the sync.
* We ignore this to avoid flapping pdsk.
* This should not happen, if the peer is a recent version of drbd. */
if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
real_peer_disk = D_UP_TO_DATE;
if (ns.conn == C_WF_REPORT_PARAMS)
ns.conn = C_CONNECTED;
if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
get_ldev_if_state(mdev, D_NEGOTIATING)) {
int cr; /* consider resync */
/* if we established a new connection */
cr = (oconn < C_CONNECTED);
cr = (os.conn < C_CONNECTED);
/* if we had an established connection
* and one of the nodes newly attaches a disk */
cr |= (oconn == C_CONNECTED &&
cr |= (os.conn == C_CONNECTED &&
(peer_state.disk == D_NEGOTIATING ||
mdev->state.disk == D_NEGOTIATING));
os.disk == D_NEGOTIATING));
/* if we have both been inconsistent, and the peer has been
* forced to be UpToDate with --overwrite-data */
cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
/* if we had been plain connected, and the admin requested to
* start a sync by "invalidate" or "invalidate-remote" */
cr |= (oconn == C_CONNECTED &&
cr |= (os.conn == C_CONNECTED &&
(peer_state.conn >= C_STARTING_SYNC_S &&
peer_state.conn <= C_WF_BITMAP_T));
if (cr)
nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
put_ldev(mdev);
if (nconn == C_MASK) {
nconn = C_CONNECTED;
if (ns.conn == C_MASK) {
ns.conn = C_CONNECTED;
if (mdev->state.disk == D_NEGOTIATING) {
drbd_force_state(mdev, NS(disk, D_DISKLESS));
} else if (peer_state.disk == D_NEGOTIATING) {
......@@ -3244,7 +3370,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
} else {
if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
return FALSE;
D_ASSERT(oconn == C_WF_REPORT_PARAMS);
D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
return FALSE;
}
......@@ -3252,18 +3378,28 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
}
spin_lock_irq(&mdev->req_lock);
if (mdev->state.conn != oconn)
if (mdev->state.i != os.i)
goto retry;
clear_bit(CONSIDER_RESYNC, &mdev->flags);
ns.i = mdev->state.i;
ns.conn = nconn;
ns.peer = peer_state.role;
ns.pdsk = real_peer_disk;
ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
ns.disk = mdev->new_state_tmp.disk;
rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
test_bit(NEW_CUR_UUID, &mdev->flags)) {
/* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
for temporary network outages! */
spin_unlock_irq(&mdev->req_lock);
dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
tl_clear(mdev);
drbd_uuid_new_current(mdev);
clear_bit(NEW_CUR_UUID, &mdev->flags);
drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
return FALSE;
}
rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
......@@ -3272,8 +3408,8 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
return FALSE;
}
if (oconn > C_WF_REPORT_PARAMS) {
if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
if (os.conn > C_WF_REPORT_PARAMS) {
if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
peer_state.disk != D_NEGOTIATING ) {
/* we want resync, peer has not yet decided to sync... */
/* Nowadays only used when forcing a node into primary role and
......@@ -3290,9 +3426,9 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct p_rs_uuid *p = (struct p_rs_uuid *)h;
struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
wait_event(mdev->misc_wait,
mdev->state.conn == C_WF_SYNC_UUID ||
......@@ -3301,10 +3437,6 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
if (drbd_recv(mdev, h->payload, h->length) != h->length)
return FALSE;
/* Here the _drbd_uuid_ functions are right, current should
_not_ be rotated into the history */
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
......@@ -3323,14 +3455,14 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
enum receive_bitmap_ret { OK, DONE, FAILED };
static enum receive_bitmap_ret
receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
unsigned long *buffer, struct bm_xfer_ctx *c)
{
unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
unsigned want = num_words * sizeof(long);
if (want != h->length) {
dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
if (want != data_size) {
dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
return FAILED;
}
if (want == 0)
......@@ -3359,7 +3491,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
u64 tmp;
unsigned long s = c->bit_offset;
unsigned long e;
int len = p->head.length - (sizeof(*p) - sizeof(p->head));
int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
int toggle = DCBP_get_start(p);
int have;
int bits;
......@@ -3428,7 +3560,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
const char *direction, struct bm_xfer_ctx *c)
{
/* what would it take to transfer it "plaintext" */
unsigned plain = sizeof(struct p_header) *
unsigned plain = sizeof(struct p_header80) *
((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+ c->bm_words * sizeof(long);
unsigned total = c->bytes[0] + c->bytes[1];
......@@ -3466,12 +3598,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
in order to be agnostic to the 32 vs 64 bits issue.
returns 0 on failure, 1 if we successfully received it. */
static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct bm_xfer_ctx c;
void *buffer;
enum receive_bitmap_ret ret;
int ok = FALSE;
struct p_header80 *h = &mdev->data.rbuf.header.h80;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
......@@ -3491,39 +3624,39 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
};
do {
if (h->command == P_BITMAP) {
ret = receive_bitmap_plain(mdev, h, buffer, &c);
} else if (h->command == P_COMPRESSED_BITMAP) {
if (cmd == P_BITMAP) {
ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
} else if (cmd == P_COMPRESSED_BITMAP) {
/* MAYBE: sanity check that we speak proto >= 90,
* and the feature is enabled! */
struct p_compressed_bm *p;
if (h->length > BM_PACKET_PAYLOAD_BYTES) {
if (data_size > BM_PACKET_PAYLOAD_BYTES) {
dev_err(DEV, "ReportCBitmap packet too large\n");
goto out;
}
/* use the page buff */
p = buffer;
memcpy(p, h, sizeof(*h));
if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
goto out;
if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
if (data_size <= (sizeof(*p) - sizeof(p->head))) {
dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
return FAILED;
}
ret = decode_bitmap_c(mdev, p, &c);
} else {
dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
goto out;
}
c.packets[h->command == P_BITMAP]++;
c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
c.packets[cmd == P_BITMAP]++;
c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
if (ret != OK)
break;
if (!drbd_recv_header(mdev, h))
if (!drbd_recv_header(mdev, &cmd, &data_size))
goto out;
} while (ret == OK);
if (ret == FAILED)
......@@ -3554,17 +3687,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
return ok;
}
static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
/* TODO zero copy sink :) */
static char sink[128];
int size, want, r;
if (!silent)
dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
h->command, h->length);
cmd, data_size);
size = h->length;
size = data_size;
while (size > 0) {
want = min_t(int, size, sizeof(sink));
r = drbd_recv(mdev, sink, want);
......@@ -3574,17 +3706,7 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
return size == 0;
}
static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
{
return receive_skip_(mdev, h, 0);
}
static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
{
return receive_skip_(mdev, h, 1);
}
static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
if (mdev->state.disk >= D_INCONSISTENT)
drbd_kick_lo(mdev);
......@@ -3596,108 +3718,94 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
static drbd_cmd_handler_f drbd_default_handler[] = {
[P_DATA] = receive_Data,
[P_DATA_REPLY] = receive_DataReply,
[P_RS_DATA_REPLY] = receive_RSDataReply,
[P_BARRIER] = receive_Barrier,
[P_BITMAP] = receive_bitmap,
[P_COMPRESSED_BITMAP] = receive_bitmap,
[P_UNPLUG_REMOTE] = receive_UnplugRemote,
[P_DATA_REQUEST] = receive_DataRequest,
[P_RS_DATA_REQUEST] = receive_DataRequest,
[P_SYNC_PARAM] = receive_SyncParam,
[P_SYNC_PARAM89] = receive_SyncParam,
[P_PROTOCOL] = receive_protocol,
[P_UUIDS] = receive_uuids,
[P_SIZES] = receive_sizes,
[P_STATE] = receive_state,
[P_STATE_CHG_REQ] = receive_req_state,
[P_SYNC_UUID] = receive_sync_uuid,
[P_OV_REQUEST] = receive_DataRequest,
[P_OV_REPLY] = receive_DataRequest,
[P_CSUM_RS_REQUEST] = receive_DataRequest,
[P_DELAY_PROBE] = receive_skip_silent,
typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
/* One entry of the receiver's packet dispatch table (drbd_cmd_handler[]),
 * which is indexed by packet type. */
struct data_cmd {
/* non-zero if the packet may carry payload after its sub header;
 * drbdd() rejects trailing payload on packets where this is 0 */
int expect_payload;
/* size of the fixed part of the packet; drbdd() subtracts
 * sizeof(union p_header) from it to get the sub header size it reads
 * before calling ->function */
size_t pkt_size;
/* handler for this packet type; drbdd() passes it the packet type and
 * the byte count left after the sub header (packet_size - sub header size) */
drbd_cmd_handler_f function;
};
static struct data_cmd drbd_cmd_handler[] = {
[P_DATA] = { 1, sizeof(struct p_data), receive_Data },
[P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
[P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
[P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
[P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
[P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
[P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
[P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
[P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
[P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
[P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
[P_STATE] = { 0, sizeof(struct p_state), receive_state },
[P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
[P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
[P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
/* anything missing from this table is in
* the asender_tbl, see get_asender_cmd */
[P_MAX_CMD] = NULL,
[P_MAX_CMD] = { 0, 0, NULL },
};
static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
static drbd_cmd_handler_f *drbd_opt_cmd_handler;
/* All handler functions that expect a sub-header get that sub-header in
mdev->data.rbuf.header.head.payload.
Usually the callback can find the usual p_header in
mdev->data.rbuf.header.head, but it must not rely on that, since there is
also p_header95!
*/
static void drbdd(struct drbd_conf *mdev)
{
drbd_cmd_handler_f handler;
struct p_header *header = &mdev->data.rbuf.header;
union p_header *header = &mdev->data.rbuf.header;
unsigned int packet_size;
enum drbd_packets cmd;
size_t shs; /* sub header size */
int rv;
while (get_t_state(&mdev->receiver) == Running) {
drbd_thread_current_set_cpu(mdev);
if (!drbd_recv_header(mdev, header)) {
drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
break;
if (!drbd_recv_header(mdev, &cmd, &packet_size))
goto err_out;
if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
goto err_out;
}
if (header->command < P_MAX_CMD)
handler = drbd_cmd_handler[header->command];
else if (P_MAY_IGNORE < header->command
&& header->command < P_MAX_OPT_CMD)
handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
else if (header->command > P_MAX_OPT_CMD)
handler = receive_skip;
else
handler = NULL;
shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
rv = drbd_recv(mdev, &header->h80.payload, shs);
if (unlikely(rv != shs)) {
dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
goto err_out;
}
if (unlikely(!handler)) {
dev_err(DEV, "unknown packet type %d, l: %d!\n",
header->command, header->length);
drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
break;
if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
goto err_out;
}
if (unlikely(!handler(mdev, header))) {
rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
if (unlikely(!rv)) {
dev_err(DEV, "error receiving %s, l: %d!\n",
cmdname(header->command), header->length);
drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
break;
cmdname(cmd), packet_size);
goto err_out;
}
}
}
static void drbd_fail_pending_reads(struct drbd_conf *mdev)
{
struct hlist_head *slot;
struct hlist_node *pos;
struct hlist_node *tmp;
struct drbd_request *req;
int i;
/*
* Application READ requests
*/
spin_lock_irq(&mdev->req_lock);
for (i = 0; i < APP_R_HSIZE; i++) {
slot = mdev->app_reads_hash+i;
hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
/* it may (but should not any longer!)
* be on the work queue; if that assert triggers,
* we need to also grab the
* spin_lock_irq(&mdev->data.work.q_lock);
* and list_del_init here. */
D_ASSERT(list_empty(&req->w.list));
/* It would be nice to complete outside of spinlock.
* But this is easier for now. */
_req_mod(req, connection_lost_while_pending);
}
}
for (i = 0; i < APP_R_HSIZE; i++)
if (!hlist_empty(mdev->app_reads_hash+i))
dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
"%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
spin_unlock_irq(&mdev->req_lock);
if (0) {
err_out:
drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
/* If we leave here, we probably want to update at least the
* "Connected" indicator on stable storage. Do so explicitly here. */
drbd_md_sync(mdev);
}
void drbd_flush_workqueue(struct drbd_conf *mdev)
......@@ -3710,6 +3818,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev)
wait_for_completion(&barr.done);
}
/*
 * Release the ee_hash and tl_hash tables of @mdev.
 *
 * Nothing is done unless a tl_hash is allocated and the connection state is
 * C_STANDALONE.  Before a table is freed each of its slots is inspected and
 * any slot that is still populated gets reported; afterwards the table
 * pointer and its size are reset.  Everything happens with mdev->req_lock
 * held.
 */
void drbd_free_tl_hash(struct drbd_conf *mdev)
{
	struct hlist_head *slot;
	struct hlist_head *end;

	spin_lock_irq(&mdev->req_lock);

	if (mdev->tl_hash && mdev->state.conn == C_STANDALONE) {
		/* paranoia code: every ee_hash slot should be empty by now */
		end = mdev->ee_hash + mdev->ee_hash_s;
		for (slot = mdev->ee_hash; slot < end; slot++) {
			if (!slot->first)
				continue;
			dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
				(int)(slot - mdev->ee_hash), slot->first);
		}
		kfree(mdev->ee_hash);
		mdev->ee_hash = NULL;
		mdev->ee_hash_s = 0;

		/* paranoia code: same check for the tl_hash slots */
		end = mdev->tl_hash + mdev->tl_hash_s;
		for (slot = mdev->tl_hash; slot < end; slot++) {
			if (!slot->first)
				continue;
			dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
				(int)(slot - mdev->tl_hash), slot->first);
		}
		kfree(mdev->tl_hash);
		mdev->tl_hash = NULL;
		mdev->tl_hash_s = 0;
	}

	spin_unlock_irq(&mdev->req_lock);
}
static void drbd_disconnect(struct drbd_conf *mdev)
{
enum drbd_fencing_p fp;
......@@ -3727,6 +3865,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
drbd_thread_stop(&mdev->asender);
drbd_free_sock(mdev);
/* wait for current activity to cease. */
spin_lock_irq(&mdev->req_lock);
_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
......@@ -3751,7 +3890,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
/* make sure syncer is stopped and w_resume_next_sg queued */
del_timer_sync(&mdev->resync_timer);
set_bit(STOP_SYNC_TIMER, &mdev->flags);
resync_timer_fn((unsigned long)mdev);
/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
......@@ -3766,11 +3904,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
kfree(mdev->p_uuid);
mdev->p_uuid = NULL;
if (!mdev->state.susp)
if (!is_susp(mdev->state))
tl_clear(mdev);
drbd_fail_pending_reads(mdev);
dev_info(DEV, "Connection closed\n");
drbd_md_sync(mdev);
......@@ -3781,12 +3917,8 @@ static void drbd_disconnect(struct drbd_conf *mdev)
put_ldev(mdev);
}
if (mdev->state.role == R_PRIMARY) {
if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
drbd_request_state(mdev, NS(pdsk, nps));
}
}
if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
drbd_try_outdate_peer_async(mdev);
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
......@@ -3799,32 +3931,14 @@ static void drbd_disconnect(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
if (os.conn == C_DISCONNECTING) {
struct hlist_head *h;
wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
if (!is_susp(mdev->state)) {
/* we must not free the tl_hash
* while application io is still on the fly */
wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
spin_lock_irq(&mdev->req_lock);
/* paranoia code */
for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
if (h->first)
dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
(int)(h - mdev->ee_hash), h->first);
kfree(mdev->ee_hash);
mdev->ee_hash = NULL;
mdev->ee_hash_s = 0;
/* paranoia code */
for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
if (h->first)
dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
(int)(h - mdev->tl_hash), h->first);
kfree(mdev->tl_hash);
mdev->tl_hash = NULL;
mdev->tl_hash_s = 0;
spin_unlock_irq(&mdev->req_lock);
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
drbd_free_tl_hash(mdev);
}
crypto_free_hash(mdev->cram_hmac_tfm);
mdev->cram_hmac_tfm = NULL;
......@@ -3844,6 +3958,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
i = drbd_release_ee(mdev, &mdev->net_ee);
if (i)
dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&mdev->pp_in_use_by_net);
if (i)
dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
i = atomic_read(&mdev->pp_in_use);
if (i)
dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
......@@ -3887,7 +4004,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
(struct p_header *)p, sizeof(*p), 0 );
(struct p_header80 *)p, sizeof(*p), 0 );
mutex_unlock(&mdev->data.mutex);
return ok;
}
......@@ -3903,27 +4020,28 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
{
/* ASSERT current == mdev->receiver ... */
struct p_handshake *p = &mdev->data.rbuf.handshake;
const int expect = sizeof(struct p_handshake)
-sizeof(struct p_header);
const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
unsigned int length;
enum drbd_packets cmd;
int rv;
rv = drbd_send_handshake(mdev);
if (!rv)
return 0;
rv = drbd_recv_header(mdev, &p->head);
rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
return 0;
if (p->head.command != P_HAND_SHAKE) {
if (cmd != P_HAND_SHAKE) {
dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
cmdname(p->head.command), p->head.command);
cmdname(cmd), cmd);
return -1;
}
if (p->head.length != expect) {
if (length != expect) {
dev_err(DEV, "expected HandShake length: %u, received: %u\n",
expect, p->head.length);
expect, length);
return -1;
}
......@@ -3981,10 +4099,11 @@ static int drbd_do_auth(struct drbd_conf *mdev)
char *response = NULL;
char *right_response = NULL;
char *peers_ch = NULL;
struct p_header p;
unsigned int key_len = strlen(mdev->net_conf->shared_secret);
unsigned int resp_size;
struct hash_desc desc;
enum drbd_packets cmd;
unsigned int length;
int rv;
desc.tfm = mdev->cram_hmac_tfm;
......@@ -4004,33 +4123,33 @@ static int drbd_do_auth(struct drbd_conf *mdev)
if (!rv)
goto fail;
rv = drbd_recv_header(mdev, &p);
rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
if (p.command != P_AUTH_CHALLENGE) {
if (cmd != P_AUTH_CHALLENGE) {
dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
cmdname(p.command), p.command);
cmdname(cmd), cmd);
rv = 0;
goto fail;
}
if (p.length > CHALLENGE_LEN*2) {
if (length > CHALLENGE_LEN * 2) {
dev_err(DEV, "expected AuthChallenge payload too big.\n");
rv = -1;
goto fail;
}
peers_ch = kmalloc(p.length, GFP_NOIO);
peers_ch = kmalloc(length, GFP_NOIO);
if (peers_ch == NULL) {
dev_err(DEV, "kmalloc of peers_ch failed\n");
rv = -1;
goto fail;
}
rv = drbd_recv(mdev, peers_ch, p.length);
rv = drbd_recv(mdev, peers_ch, length);
if (rv != p.length) {
if (rv != length) {
dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
rv = 0;
goto fail;
......@@ -4045,7 +4164,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
}
sg_init_table(&sg, 1);
sg_set_buf(&sg, peers_ch, p.length);
sg_set_buf(&sg, peers_ch, length);
rv = crypto_hash_digest(&desc, &sg, sg.length, response);
if (rv) {
......@@ -4058,18 +4177,18 @@ static int drbd_do_auth(struct drbd_conf *mdev)
if (!rv)
goto fail;
rv = drbd_recv_header(mdev, &p);
rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
if (p.command != P_AUTH_RESPONSE) {
if (cmd != P_AUTH_RESPONSE) {
dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
cmdname(p.command), p.command);
cmdname(cmd), cmd);
rv = 0;
goto fail;
}
if (p.length != resp_size) {
if (length != resp_size) {
dev_err(DEV, "expected AuthResponse payload of wrong size\n");
rv = 0;
goto fail;
......@@ -4154,7 +4273,7 @@ int drbdd_init(struct drbd_thread *thi)
/* ********* acknowledge sender ******** */
static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_req_state_reply *p = (struct p_req_state_reply *)h;
......@@ -4172,13 +4291,13 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
/* asender handler registered for P_PING: answers with a ping ack and returns
 * the result of sending it.  The received header @h is not looked at. */
static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
{
return drbd_send_ping_ack(mdev);
}
static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
{
/* restore idle timeout */
mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
......@@ -4188,7 +4307,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
......@@ -4198,11 +4317,15 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
if (get_ldev(mdev)) {
drbd_rs_complete_io(mdev, sector);
drbd_set_in_sync(mdev, sector, blksize);
/* rs_same_csum is supposed to count in units of BM_BLOCK_SIZE */
mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
put_ldev(mdev);
}
dec_rs_pending(mdev);
atomic_add(blksize >> 9, &mdev->rs_sect_in);
return TRUE;
}
......@@ -4258,7 +4381,7 @@ static int validate_req_change_req_state(struct drbd_conf *mdev,
return TRUE;
}
static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
......@@ -4298,7 +4421,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
_ack_id_to_req, __func__ , what);
}
static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
......@@ -4318,7 +4441,7 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
_ack_id_to_req, __func__ , neg_acked);
}
static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
......@@ -4331,7 +4454,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
_ar_id_to_req, __func__ , neg_acked);
}
static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
sector_t sector;
int size;
......@@ -4353,7 +4476,7 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_barrier_ack *p = (struct p_barrier_ack *)h;
......@@ -4362,7 +4485,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
struct drbd_work *w;
......@@ -4379,6 +4502,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
else
ov_oos_print(mdev);
if (!get_ldev(mdev))
return TRUE;
drbd_rs_complete_io(mdev, sector);
dec_rs_pending(mdev);
......@@ -4393,18 +4519,18 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
drbd_resync_finished(mdev);
}
}
put_ldev(mdev);
return TRUE;
}
static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
/* asender handler for packets that need no processing (registered for
 * P_DELAY_PROBE): neither @mdev nor @h is used, it always returns TRUE. */
static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
{
/* IGNORE */
return TRUE;
}
struct asender_cmd {
size_t pkt_size;
int (*process)(struct drbd_conf *mdev, struct p_header *h);
int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
};
static struct asender_cmd *get_asender_cmd(int cmd)
......@@ -4413,8 +4539,8 @@ static struct asender_cmd *get_asender_cmd(int cmd)
/* anything missing from this table is in
* the drbd_cmd_handler (drbd_default_handler) table,
* see the beginning of drbdd() */
[P_PING] = { sizeof(struct p_header), got_Ping },
[P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
[P_PING] = { sizeof(struct p_header80), got_Ping },
[P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
[P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
......@@ -4426,7 +4552,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
[P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
[P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
[P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
[P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
[P_MAX_CMD] = { 0, NULL },
};
if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
......@@ -4437,13 +4563,13 @@ static struct asender_cmd *get_asender_cmd(int cmd)
int drbd_asender(struct drbd_thread *thi)
{
struct drbd_conf *mdev = thi->mdev;
struct p_header *h = &mdev->meta.rbuf.header;
struct p_header80 *h = &mdev->meta.rbuf.header.h80;
struct asender_cmd *cmd = NULL;
int rv, len;
void *buf = h;
int received = 0;
int expect = sizeof(struct p_header);
int expect = sizeof(struct p_header80);
int empty;
sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
......@@ -4467,10 +4593,8 @@ int drbd_asender(struct drbd_thread *thi)
while (1) {
clear_bit(SIGNAL_ASENDER, &mdev->flags);
flush_signals(current);
if (!drbd_process_done_ee(mdev)) {
dev_err(DEV, "process_done_ee() = NOT_OK\n");
if (!drbd_process_done_ee(mdev))
goto reconnect;
}
/* to avoid race with newly queued ACKs */
set_bit(SIGNAL_ASENDER, &mdev->flags);
spin_lock_irq(&mdev->req_lock);
......@@ -4529,21 +4653,23 @@ int drbd_asender(struct drbd_thread *thi)
if (received == expect && cmd == NULL) {
if (unlikely(h->magic != BE_DRBD_MAGIC)) {
dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
(long)be32_to_cpu(h->magic),
h->command, h->length);
dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
be32_to_cpu(h->magic),
be16_to_cpu(h->command),
be16_to_cpu(h->length));
goto reconnect;
}
cmd = get_asender_cmd(be16_to_cpu(h->command));
len = be16_to_cpu(h->length);
if (unlikely(cmd == NULL)) {
dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
(long)be32_to_cpu(h->magic),
h->command, h->length);
dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
be32_to_cpu(h->magic),
be16_to_cpu(h->command),
be16_to_cpu(h->length));
goto disconnect;
}
expect = cmd->pkt_size;
ERR_IF(len != expect-sizeof(struct p_header))
ERR_IF(len != expect-sizeof(struct p_header80))
goto reconnect;
}
if (received == expect) {
......@@ -4553,7 +4679,7 @@ int drbd_asender(struct drbd_thread *thi)
buf = h;
received = 0;
expect = sizeof(struct p_header);
expect = sizeof(struct p_header80);
cmd = NULL;
}
}
......@@ -4561,10 +4687,12 @@ int drbd_asender(struct drbd_thread *thi)
if (0) {
reconnect:
drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
drbd_md_sync(mdev);
}
if (0) {
disconnect:
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
drbd_md_sync(mdev);
}
clear_bit(SIGNAL_ASENDER, &mdev->flags);
......
......@@ -59,10 +59,7 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
{
const unsigned long s = req->rq_state;
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
if (rw == WRITE) {
/* remove it from the transfer log.
* well, only if it had been there in the first
* place... if it had not (local only or conflicting
......@@ -70,6 +67,11 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
* initialized in drbd_req_new(), so we can list_del() it
* here unconditionally */
list_del(&req->tl_requests);
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
if (rw == WRITE) {
/* Set out-of-sync unless both OK flags are set
* (local only or remote failed).
* Other places where we set out-of-sync:
......@@ -92,6 +94,7 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
*/
if (s & RQ_LOCAL_MASK) {
if (get_ldev_if_state(mdev, D_FAILED)) {
if (s & RQ_IN_ACT_LOG)
drbd_al_complete_io(mdev, req->sector);
put_ldev(mdev);
} else if (__ratelimit(&drbd_ratelimit_state)) {
......@@ -280,6 +283,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
* protocol A or B, barrier ack still pending... */
}
/*
 * Forward to _req_may_be_done() only while the device state is not
 * suspended (as reported by is_susp()); otherwise do nothing.
 */
static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
{
	if (is_susp(req->mdev->state))
		return;

	_req_may_be_done(req, m);
}
/*
* checks whether there was an overlapping request
* or ee already registered.
......@@ -380,10 +391,11 @@ static int _req_conflicts(struct drbd_request *req)
* and it enforces that we have to think in a very structured manner
* about the "events" that may happen to a request during its life time ...
*/
void __req_mod(struct drbd_request *req, enum drbd_req_event what,
int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m)
{
struct drbd_conf *mdev = req->mdev;
int rv = 0;
m->bio = NULL;
switch (what) {
......@@ -420,7 +432,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
......@@ -429,7 +441,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~RQ_LOCAL_PENDING;
__drbd_chk_io_error(mdev, FALSE);
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
......@@ -437,7 +449,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* it is legal to fail READA */
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
......@@ -455,7 +467,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* no point in retrying if there is no good remote data,
* or we have no connection. */
if (mdev->state.pdsk != D_UP_TO_DATE) {
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
break;
}
......@@ -517,11 +529,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
req->epoch = mdev->newest_tle->br_number;
list_add_tail(&req->tl_requests,
&mdev->newest_tle->requests);
/* increment size of current epoch */
mdev->newest_tle->n_req++;
mdev->newest_tle->n_writes++;
/* queue work item to send data */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
......@@ -530,7 +540,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
drbd_queue_work(&mdev->data.work, &req->w);
/* close the epoch, in case it outgrew the limit */
if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
queue_barrier(mdev);
break;
......@@ -543,7 +553,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~RQ_NET_QUEUED;
/* if we did it right, tl_clear should be scheduled only after
* this, so this should not be necessary! */
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
break;
case handed_over_to_network:
......@@ -568,7 +578,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
* "completed_ok" events came in, once we return from
* _drbd_send_zc_bio (drbd_send_dblock), we have to check
* whether it is done already, and end it. */
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
break;
case read_retry_remote_canceled:
......@@ -584,7 +594,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* if it is still queued, we may not complete it here.
* it will be canceled soon. */
if (!(req->rq_state & RQ_NET_QUEUED))
_req_may_be_done(req, m);
_req_may_be_done(req, m); /* Allowed while state.susp */
break;
case write_acked_by_peer_and_sis:
......@@ -619,7 +629,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
D_ASSERT(req->rq_state & RQ_NET_PENDING);
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
break;
case neg_acked:
......@@ -629,11 +639,50 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
req->rq_state |= RQ_NET_DONE;
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
/* else: done by handed_over_to_network */
break;
case fail_frozen_disk_io:
if (!(req->rq_state & RQ_LOCAL_COMPLETED))
break;
_req_may_be_done(req, m); /* Allowed while state.susp */
break;
case restart_frozen_disk_io:
if (!(req->rq_state & RQ_LOCAL_COMPLETED))
break;
req->rq_state &= ~RQ_LOCAL_COMPLETED;
rv = MR_READ;
if (bio_data_dir(req->master_bio) == WRITE)
rv = MR_WRITE;
get_ldev(mdev);
req->w.cb = w_restart_disk_io;
drbd_queue_work(&mdev->data.work, &req->w);
break;
case resend:
/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
before the connection loss (B&C only); only P_BARRIER_ACK was missing.
Throwing them out of the TL here by pretending we got a BARRIER_ACK.
We ensure that the peer was not rebooted. */
if (!(req->rq_state & RQ_NET_OK)) {
if (req->w.cb) {
drbd_queue_work(&mdev->data.work, &req->w);
rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
}
break;
}
/* else, fall through to barrier_acked */
case barrier_acked:
if (!(req->rq_state & RQ_WRITE))
break;
if (req->rq_state & RQ_NET_PENDING) {
/* barrier came in before all requests have been acked.
* this is bad, because if the connection is lost now,
......@@ -643,7 +692,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
}
D_ASSERT(req->rq_state & RQ_NET_SENT);
req->rq_state |= RQ_NET_DONE;
_req_may_be_done(req, m);
_req_may_be_done(req, m); /* Allowed while state.susp */
break;
case data_received:
......@@ -651,9 +700,11 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
_req_may_be_done(req, m);
_req_may_be_done_not_susp(req, m);
break;
};
return rv;
}
/* we may do a local read if:
......@@ -752,14 +803,16 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
* resync extent to finish, and, if necessary, pulls in the target
* extent into the activity log, which involves further disk io because
* of transactional on-disk meta data updates. */
if (rw == WRITE && local)
if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
req->rq_state |= RQ_IN_ACT_LOG;
drbd_al_begin_io(mdev, sector);
}
remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
(mdev->state.pdsk == D_INCONSISTENT &&
mdev->state.conn >= C_CONNECTED));
if (!(local || remote) && !mdev->state.susp) {
if (!(local || remote) && !is_susp(mdev->state)) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
goto fail_free_complete;
}
......@@ -785,7 +838,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->req_lock);
if (mdev->state.susp) {
if (is_susp(mdev->state)) {
/* If we got suspended, use the retry mechanism of
generic_make_request() to restart processing of this
bio. In the next call to drbd_make_request_26
......@@ -867,30 +920,10 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
/* check this request on the collision detection hash tables.
* if we have a conflict, just complete it here.
* THINK do we want to check reads, too? (I don't think so...) */
if (rw == WRITE && _req_conflicts(req)) {
/* this is a conflicting request.
* even though it may have been only _partially_
* overlapping with one of the currently pending requests,
* without even submitting or sending it, we will
* pretend that it was successfully served right now.
*/
if (local) {
bio_put(req->private_bio);
req->private_bio = NULL;
drbd_al_complete_io(mdev, req->sector);
put_ldev(mdev);
local = 0;
}
if (remote)
dec_ap_pending(mdev);
_drbd_end_io_acct(mdev, req);
/* THINK: do we want to fail it (-EIO), or pretend success? */
bio_endio(req->master_bio, 0);
req->master_bio = NULL;
dec_ap_bio(mdev);
drbd_req_free(req);
remote = 0;
}
if (rw == WRITE && _req_conflicts(req))
goto fail_conflicting;
list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
/* NOTE remote first: to get the concurrent write detection right,
* we must register the request before start of local IO. */
......@@ -923,6 +956,21 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
return 0;
fail_conflicting:
/* this is a conflicting request.
* even though it may have been only _partially_
* overlapping with one of the currently pending requests,
* without even submitting or sending it, we will
* pretend that it was successfully served right now.
*/
_drbd_end_io_acct(mdev, req);
spin_unlock_irq(&mdev->req_lock);
if (remote)
dec_ap_pending(mdev);
/* THINK: do we want to fail it (-EIO), or pretend success?
* this pretends success. */
err = 0;
fail_free_complete:
if (rw == WRITE && local)
drbd_al_complete_io(mdev, sector);
......@@ -961,21 +1009,6 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
return 1;
}
/*
* Paranoia: we might have been primary, but sync target, or
* even diskless, then lost the connection.
* This should have been handled (panic? suspend?) somewhere
* else. But maybe it was not, so check again here.
* Caution: as long as we do not have a read/write lock on mdev,
* to serialize state changes, this is racy, since we may lose
* the connection *after* we test for the cstate.
*/
if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
return 1;
}
return 0;
}
......
......@@ -104,6 +104,9 @@ enum drbd_req_event {
read_ahead_completed_with_error,
write_completed_with_error,
completed_ok,
resend,
fail_frozen_disk_io,
restart_frozen_disk_io,
nothing, /* for tracing only */
};
......@@ -183,6 +186,12 @@ enum drbd_req_state_bits {
/* keep this last, its for the RQ_NET_MASK */
__RQ_NET_MAX,
/* Set when this is a write, clear for a read */
__RQ_WRITE,
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
......@@ -201,6 +210,16 @@ enum drbd_req_state_bits {
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
/* For waking up the frozen transfer log, mod_req() has to report whether the
request should be counted in the epoch object. */
#define MR_WRITE_SHIFT 0
#define MR_WRITE (1 << MR_WRITE_SHIFT)
#define MR_READ_SHIFT 1
#define MR_READ (1 << MR_READ_SHIFT)
/* epoch entries */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
......@@ -244,30 +263,36 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
return NULL;
}
/*
 * Clone @bio_src and attach the clone to @req as its private bio.
 * The clone points back at @req via bi_private, completes through
 * drbd_endio_pri, and has its bi_next link cleared.
 */
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
	struct bio *clone = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */

	clone->bi_private = req;
	clone->bi_end_io = drbd_endio_pri;
	clone->bi_next = NULL;

	req->private_bio = clone;
}
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
struct bio *bio;
struct drbd_request *req =
mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (likely(req)) {
bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
drbd_req_make_private_bio(req, bio_src);
req->rq_state = 0;
req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->mdev = mdev;
req->master_bio = bio_src;
req->private_bio = bio;
req->epoch = 0;
req->sector = bio->bi_sector;
req->size = bio->bi_size;
req->sector = bio_src->bi_sector;
req->size = bio_src->bi_size;
req->start_time = jiffies;
INIT_HLIST_NODE(&req->colision);
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
bio->bi_private = req;
bio->bi_end_io = drbd_endio_pri;
bio->bi_next = NULL;
}
return req;
}
......@@ -292,36 +317,43 @@ struct bio_and_error {
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m);
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
int rv;
/* __req_mod possibly frees req, do not touch req after that! */
__req_mod(req, what, &m);
rv = __req_mod(req, what, &m);
if (m.bio)
complete_master_bio(mdev, &m);
return rv;
}
/* completion of master bio is outside of spinlock.
* If you need it irqsave, do it your self! */
static inline void req_mod(struct drbd_request *req,
static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
int rv;
spin_lock_irq(&mdev->req_lock);
__req_mod(req, what, &m);
rv = __req_mod(req, what, &m);
spin_unlock_irq(&mdev->req_lock);
if (m.bio)
complete_master_bio(mdev, &m);
return rv;
}
#endif
......@@ -39,8 +39,6 @@
#include "drbd_int.h"
#include "drbd_req.h"
#define SLEEP_TIME (HZ/10)
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
......@@ -217,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error)
*/
void drbd_endio_pri(struct bio *bio, int error)
{
unsigned long flags;
struct drbd_request *req = bio->bi_private;
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
......@@ -246,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error)
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
spin_lock_irqsave(&mdev->req_lock, flags);
__req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
if (m.bio)
complete_master_bio(mdev, &m);
req_mod(req, what);
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
......@@ -376,54 +367,145 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
struct drbd_epoch_entry *e;
if (!get_ldev(mdev))
return 0;
return -EIO;
if (drbd_rs_should_slow_down(mdev))
goto defer;
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
if (!e)
goto fail;
goto defer;
e->w.cb = w_e_send_csum;
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
e->w.cb = w_e_send_csum;
atomic_add(size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
return 1;
return 0;
/* drbd_submit_ee currently fails for one reason only:
* not being able to allocate enough bios.
* Is dropping the connection going to help? */
spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list);
spin_unlock_irq(&mdev->req_lock);
drbd_free_ee(mdev, e);
fail:
defer:
put_ldev(mdev);
return 2;
return -EAGAIN;
}
void resync_timer_fn(unsigned long data)
{
unsigned long flags;
struct drbd_conf *mdev = (struct drbd_conf *) data;
int queue;
spin_lock_irqsave(&mdev->req_lock, flags);
if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
queue = 1;
if (mdev->state.conn == C_VERIFY_S)
switch (mdev->state.conn) {
case C_VERIFY_S:
mdev->resync_work.cb = w_make_ov_request;
else
break;
case C_SYNC_TARGET:
mdev->resync_work.cb = w_make_resync_request;
} else {
break;
default:
queue = 0;
mdev->resync_work.cb = w_resync_inactive;
}
spin_unlock_irqrestore(&mdev->req_lock, flags);
/* harmless race: list_empty outside data.work.q_lock */
if (list_empty(&mdev->resync_work.list) && queue)
drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
/* Overwrite every slot of @fb with @value. */
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int slot = 0;

	while (slot < fb->size) {
		fb->values[slot] = value;
		slot++;
	}
}
/*
 * Store @value in the slot at head_index and advance head_index,
 * wrapping to 0 once it reaches fb->size.
 * Returns the value that was in the slot before it was overwritten.
 */
static int fifo_push(struct fifo_buffer *fb, int value)
{
	int evicted = fb->values[fb->head_index];

	fb->values[fb->head_index] = value;
	fb->head_index++;
	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return evicted;
}
/* Add @value to every slot of @fb. */
static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int slot = 0;

	while (slot < fb->size) {
		fb->values[slot] += value;
		slot++;
	}
}
/*
 * Compute how many sectors to request in the current resync step.
 *
 * Consumes (and resets to 0) mdev->rs_sect_in, subtracts it from
 * mdev->rs_in_flight, and updates mdev->rs_planed and the plan FIFO
 * mdev->rs_plan_s. The FIFO is modified under mdev->peer_seq_lock.
 *
 * Returns req_sect, clamped to the range [0, max_sect] where max_sect is
 * derived from sync_conf.c_max_rate.
 *
 * NOTE(review): steps (rs_plan_s.size) is used as a divisor below; the
 * caller in w_make_resync_request only calls this when rs_plan_s.size is
 * non-zero -- confirm that holds for any other caller.
 */
int drbd_rs_controller(struct drbd_conf *mdev)
{
unsigned int sect_in; /* Number of sectors that came in since the last turn */
unsigned int want; /* The number of sectors we want in the proxy */
int req_sect; /* Number of sectors to request in this turn */
int correction; /* Number of sectors more we need in the proxy*/
int cps; /* correction per invocation of drbd_rs_controller() */
int steps; /* Number of time steps to plan ahead */
int curr_corr; /* value popped from the plan FIFO for this step */
int max_sect; /* upper bound for req_sect, from sync_conf.c_max_rate */
/* fetch the counter and reset it to 0 in one atomic operation */
sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
mdev->rs_in_flight -= sect_in;
spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
/* nothing was in flight before this turn: derive the target from sync_conf.rate */
want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
} else { /* normal path */
/* use c_fill_target when it is non-zero, otherwise derive the target
 * from what came in and c_delay_target */
want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
}
/* difference between the target and what is in flight or already planned */
correction = want - mdev->rs_in_flight - mdev->rs_planed;
/* Plan ahead */
/* spread the correction evenly over all FIFO slots; rs_planed is bumped by
 * cps * steps, i.e. by what was actually added after integer division */
cps = correction / steps;
fifo_add_val(&mdev->rs_plan_s, cps);
mdev->rs_planed += cps * steps;
/* What we do in this step */
/* take the head slot's value and replace it with 0 */
curr_corr = fifo_push(&mdev->rs_plan_s, 0);
spin_unlock(&mdev->peer_seq_lock);
/* the popped amount is no longer "planned" */
mdev->rs_planed -= curr_corr;
req_sect = sect_in + curr_corr;
/* clamp to [0, max_sect] */
if (req_sect < 0)
req_sect = 0;
max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
if (req_sect > max_sect)
req_sect = max_sect;
/*
dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
sect_in, mdev->rs_in_flight, want, correction,
steps, cps, mdev->rs_planed, curr_corr, req_sect);
*/
return req_sect;
}
int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel)
{
......@@ -431,8 +513,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
int max_segment_size;
int number, i, size, pe, mx;
int number, rollback_i, size, pe, mx;
int align, queued, sndbuf;
int i = 0;
if (unlikely(cancel))
return 1;
......@@ -446,6 +529,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
dev_err(DEV, "%s in w_make_resync_request\n",
drbd_conn_str(mdev->state.conn));
if (mdev->rs_total == 0) {
/* empty resync? */
drbd_resync_finished(mdev);
return 1;
}
if (!get_ldev(mdev)) {
/* Since we only need to access mdev->rsync a
get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
......@@ -458,11 +547,25 @@ int w_make_resync_request(struct drbd_conf *mdev,
/* starting with drbd 8.3.8, we can handle multi-bio EEs,
* if it should be necessary */
max_segment_size = mdev->agreed_pro_version < 94 ?
queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
max_segment_size =
mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ);
pe = atomic_read(&mdev->rs_pending_cnt);
if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
} else {
mdev->c_sync_rate = mdev->sync_conf.rate;
number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
}
/* Throttle resync on lower level disk activity, which may also be
* caused by application IO on Primary/SyncTarget.
* Keep this after the call to drbd_rs_controller, as that assumes
* to be called as precisely as possible every SLEEP_TIME,
* and would be confused otherwise. */
if (drbd_rs_should_slow_down(mdev))
goto requeue;
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket)
......@@ -476,6 +579,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
mx = number;
/* Limit the number of pending RS requests to no more than the peer's receive buffer */
pe = atomic_read(&mdev->rs_pending_cnt);
if ((pe + number) > mx) {
number = mx - pe;
}
......@@ -526,6 +630,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
* be prepared for all stripe sizes of software RAIDs.
*/
align = 1;
rollback_i = i;
for (;;) {
if (size + BM_BLOCK_SIZE > max_segment_size)
break;
......@@ -561,14 +666,19 @@ int w_make_resync_request(struct drbd_conf *mdev,
size = (capacity-sector)<<9;
if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
case 0: /* Disk failure*/
case -EIO: /* Disk failure */
put_ldev(mdev);
return 0;
case 2: /* Allocation failed */
case -EAGAIN: /* allocation failed, or ldev busy */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
i = rollback_i;
goto requeue;
/* case 1: everything ok */
case 0:
/* everything ok */
break;
default:
BUG();
}
} else {
inc_rs_pending(mdev);
......@@ -595,6 +705,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
}
requeue:
mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
return 1;
......@@ -670,6 +781,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
return 1;
}
/*
 * Clear GOT_PING_ACK, request a ping, then wait on mdev->misc_wait until
 * either GOT_PING_ACK is observed set again or the connection state has
 * dropped below C_CONNECTED.
 */
static void ping_peer(struct drbd_conf *mdev)
{
/* clear first, so that only an ack flagged after this point ends the wait */
clear_bit(GOT_PING_ACK, &mdev->flags);
request_ping(mdev);
/* the second condition keeps us from waiting on while not connected */
wait_event(mdev->misc_wait,
test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}
int drbd_resync_finished(struct drbd_conf *mdev)
{
unsigned long db, dt, dbdt;
......@@ -709,6 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (!get_ldev(mdev))
goto out;
ping_peer(mdev);
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
......@@ -801,6 +922,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
mdev->rs_paused = 0;
mdev->ov_start_sector = 0;
drbd_md_sync(mdev);
if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
......@@ -817,9 +940,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
{
if (drbd_ee_has_active_page(e)) {
/* This might happen if sendpage() has not finished */
int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
atomic_add(i, &mdev->pp_in_use_by_net);
atomic_sub(i, &mdev->pp_in_use);
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->net_ee);
spin_unlock_irq(&mdev->req_lock);
wake_up(&drbd_pp_wait);
} else
drbd_free_ee(mdev, e);
}
......@@ -926,9 +1053,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return 1;
}
if (get_ldev(mdev)) {
drbd_rs_complete_io(mdev, e->sector);
put_ldev(mdev);
}
di = (struct digest_info *)(unsigned long)e->block_id;
di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
......@@ -952,7 +1082,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
} else {
inc_rs_pending(mdev);
e->block_id = ID_SYNCER;
e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
kfree(di);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
}
} else {
......@@ -962,9 +1094,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
kfree(di);
move_to_net_ee_or_free(mdev, e);
if (unlikely(!ok))
......@@ -1034,9 +1163,12 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
if (get_ldev(mdev)) {
drbd_rs_complete_io(mdev, e->sector);
put_ldev(mdev);
}
di = (struct digest_info *)(unsigned long)e->block_id;
di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
......@@ -1055,9 +1187,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
kfree(di);
if (!eq)
drbd_ov_oos_found(mdev, e->sector, e->size);
else
......@@ -1108,7 +1237,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in w_clear_epoch. */
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
(struct p_header *)p, sizeof(*p), 0);
(struct p_header80 *)p, sizeof(*p), 0);
drbd_put_data_sock(mdev);
return ok;
......@@ -1173,6 +1302,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return ok;
}
/*
 * Work callback: resubmit the local disk I/O of the request embedding @w.
 *
 * For a write that carries RQ_IN_ACT_LOG, drbd_al_begin_io() is called for
 * the request's sector first. A fresh private bio is then cloned from the
 * master bio, pointed at the backing device and submitted.
 * @cancel is not evaluated. Always returns 1.
 */
int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	/* Calling drbd_al_begin_io() out of the worker might deadlock
	 * theoretically. Practically it can not deadlock, since this is
	 * only used when unfreezing IOs. All the extents of the requests
	 * that made it into the TL are already active. */
	if ((req->rq_state & RQ_IN_ACT_LOG) &&
	    bio_data_dir(req->master_bio) == WRITE)
		drbd_al_begin_io(mdev, req->sector);

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 1;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
......@@ -1298,14 +1445,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
return retcode;
}
/*
 * Clear GOT_PING_ACK, request a ping, then wait on mdev->misc_wait until
 * either GOT_PING_ACK is observed set again or the connection state has
 * dropped below C_CONNECTED.
 */
static void ping_peer(struct drbd_conf *mdev)
{
/* clear first, so that only an ack flagged after this point ends the wait */
clear_bit(GOT_PING_ACK, &mdev->flags);
request_ping(mdev);
/* the second condition keeps us from waiting on while not connected */
wait_event(mdev->misc_wait,
test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}
/**
* drbd_start_resync() - Start the resync process
* @mdev: DRBD device.
......@@ -1379,13 +1518,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
r = SS_UNKNOWN_ERROR;
if (r == SS_SUCCESS) {
mdev->rs_total =
mdev->rs_mark_left = drbd_bm_total_weight(mdev);
unsigned long tw = drbd_bm_total_weight(mdev);
unsigned long now = jiffies;
int i;
mdev->rs_failed = 0;
mdev->rs_paused = 0;
mdev->rs_start =
mdev->rs_mark_time = jiffies;
mdev->rs_same_csum = 0;
mdev->rs_last_events = 0;
mdev->rs_last_sect_ev = 0;
mdev->rs_total = tw;
mdev->rs_start = now;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
mdev->rs_mark_left[i] = tw;
mdev->rs_mark_time[i] = now;
}
_drbd_pause_after(mdev);
}
write_unlock_irq(&global_state_lock);
......@@ -1397,12 +1544,31 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
(unsigned long) mdev->rs_total);
if (mdev->rs_total == 0) {
/* Peer still reachable? Beware of failing before-resync-target handlers! */
ping_peer(mdev);
if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
/* This still has a race (about when exactly the peers
* detect connection loss) that can lead to a full sync
* on next handshake. In 8.3.9 we fixed this with explicit
* resync-finished notifications, but the fix
* introduces a protocol change. Sleeping for some
* time longer than the ping interval + timeout on the
* SyncSource, to give the SyncTarget the chance to
* detect connection loss, then waiting for a ping
* response (implicit in drbd_resync_finished) reduces
* the race considerably, but does not solve it. */
if (side == C_SYNC_SOURCE)
schedule_timeout_interruptible(
mdev->net_conf->ping_int * HZ +
mdev->net_conf->ping_timeo*HZ/9);
drbd_resync_finished(mdev);
}
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
mdev->rs_in_flight = 0;
mdev->rs_planed = 0;
spin_lock(&mdev->peer_seq_lock);
fifo_set(&mdev->rs_plan_s, 0);
spin_unlock(&mdev->peer_seq_lock);
/* ns.conn may already be != mdev->state.conn,
* we may have been paused in between, or become paused until
* the timer triggers.
......
......@@ -258,8 +258,8 @@ static int irqdma_allocated;
#include <linux/completion.h>
static struct request *current_req;
static struct request_queue *floppy_queue;
static void do_fd_request(struct request_queue *q);
static int set_next_request(void);
#ifndef fd_get_dma_residue
#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
......@@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE];
static struct block_device *opened_bdev[N_DRIVE];
static DEFINE_MUTEX(open_lock);
static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
static int fdc_queue;
/*
* This struct defines the different floppy types.
......@@ -890,8 +891,8 @@ static void unlock_fdc(void)
del_timer(&fd_timeout);
cont = NULL;
clear_bit(0, &fdc_busy);
if (current_req || blk_peek_request(floppy_queue))
do_fd_request(floppy_queue);
if (current_req || set_next_request())
do_fd_request(current_req->q);
spin_unlock_irqrestore(&floppy_lock, flags);
wake_up(&fdc_wait);
}
......@@ -2243,8 +2244,8 @@ static void floppy_end_request(struct request *req, int error)
* logical buffer */
static void request_done(int uptodate)
{
struct request_queue *q = floppy_queue;
struct request *req = current_req;
struct request_queue *q;
unsigned long flags;
int block;
char msg[sizeof("request done ") + sizeof(int) * 3];
......@@ -2258,6 +2259,8 @@ static void request_done(int uptodate)
return;
}
q = req->q;
if (uptodate) {
/* maintain values for invalidation on geometry
* change */
......@@ -2811,6 +2814,28 @@ static int make_raw_rw_request(void)
return 2;
}
/*
 * Pick the next request to service, visiting the per-drive queues in
 * round-robin order starting at fdc_queue. fdc_queue is advanced (with
 * wrap-around at N_DRIVE) for every drive inspected, so the following
 * call resumes after the drive examined last.
 *
 * Stores the fetched request in current_req and returns non-zero when
 * current_req is set, zero otherwise.
 */
static int set_next_request(void)
{
	const int start = fdc_queue;

	do {
		struct request_queue *queue = disks[fdc_queue]->queue;

		fdc_queue = (fdc_queue + 1 == N_DRIVE) ? 0 : fdc_queue + 1;

		if (!queue)
			continue;

		current_req = blk_fetch_request(queue);
		if (current_req)
			break;
	} while (fdc_queue != start);

	return current_req != NULL;
}
static void redo_fd_request(void)
{
int drive;
......@@ -2822,17 +2847,17 @@ static void redo_fd_request(void)
do_request:
if (!current_req) {
struct request *req;
int pending;
spin_lock_irq(floppy_queue->queue_lock);
req = blk_fetch_request(floppy_queue);
spin_unlock_irq(floppy_queue->queue_lock);
if (!req) {
spin_lock_irq(&floppy_lock);
pending = set_next_request();
spin_unlock_irq(&floppy_lock);
if (!pending) {
do_floppy = NULL;
unlock_fdc();
return;
}
current_req = req;
}
drive = (long)current_req->rq_disk->private_data;
set_fdc(drive);
......@@ -4165,6 +4190,13 @@ static int __init floppy_init(void)
goto out_put_disk;
}
disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
if (!disks[dr]->queue) {
err = -ENOMEM;
goto out_put_disk;
}
blk_queue_max_hw_sectors(disks[dr]->queue, 64);
disks[dr]->major = FLOPPY_MAJOR;
disks[dr]->first_minor = TOMINOR(dr);
disks[dr]->fops = &floppy_fops;
......@@ -4183,13 +4215,6 @@ static int __init floppy_init(void)
if (err)
goto out_unreg_blkdev;
floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
if (!floppy_queue) {
err = -ENOMEM;
goto out_unreg_driver;
}
blk_queue_max_hw_sectors(floppy_queue, 64);
blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
floppy_find, NULL, NULL);
......@@ -4317,7 +4342,6 @@ static int __init floppy_init(void)
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->queue = floppy_queue;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
disks[drive]->driverfs_dev = &floppy_device[drive].dev;
add_disk(disks[drive]);
......@@ -4333,8 +4357,6 @@ static int __init floppy_init(void)
floppy_release_irq_and_dma();
out_unreg_region:
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
blk_cleanup_queue(floppy_queue);
out_unreg_driver:
platform_driver_unregister(&floppy_driver);
out_unreg_blkdev:
unregister_blkdev(FLOPPY_MAJOR, "fd");
......@@ -4342,6 +4364,8 @@ static int __init floppy_init(void)
while (dr--) {
del_timer(&motor_off_timer[dr]);
put_disk(disks[dr]);
if (disks[dr]->queue)
blk_cleanup_queue(disks[dr]->queue);
}
return err;
}
......@@ -4550,11 +4574,11 @@ static void __exit floppy_module_exit(void)
platform_device_unregister(&floppy_device[drive]);
}
put_disk(disks[drive]);
blk_cleanup_queue(disks[drive]->queue);
}
del_timer_sync(&fd_timeout);
del_timer_sync(&fd_timer);
blk_cleanup_queue(floppy_queue);
if (atomic_read(&usage_count))
floppy_release_irq_and_dma();
......
......@@ -74,6 +74,7 @@
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <asm/uaccess.h>
......@@ -738,6 +739,103 @@ static inline int is_loop_device(struct file *file)
return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}
/* loop sysfs attributes */
static ssize_t loop_attr_show(struct device *dev, char *page,
ssize_t (*callback)(struct loop_device *, char *))
{
struct loop_device *l, *lo = NULL;
mutex_lock(&loop_devices_mutex);
list_for_each_entry(l, &loop_devices, lo_list)
if (disk_to_dev(l->lo_disk) == dev) {
lo = l;
break;
}
mutex_unlock(&loop_devices_mutex);
return lo ? callback(lo, page) : -EIO;
}
/*
 * LOOP_ATTR_RO(name) - declare a read-only loop sysfs attribute.
 *
 * For a given name the macro emits three things:
 *  - a forward declaration of loop_attr_<name>_show(), which the user of
 *    the macro must define and which formats the value into the buffer;
 *  - loop_attr_do_show_<name>(), a device_attribute show handler that
 *    forwards to loop_attr_show() with loop_attr_<name>_show as callback;
 *  - loop_attr_<name>, a struct device_attribute built with __ATTR() using
 *    mode S_IRUGO, the handler above as show and NULL as store.
 */
#define LOOP_ATTR_RO(_name) \
static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
static ssize_t loop_attr_do_show_##_name(struct device *d, \
struct device_attribute *attr, char *b) \
{ \
return loop_attr_show(d, b, loop_attr_##_name##_show); \
} \
static struct device_attribute loop_attr_##_name = \
__ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
/*
 * sysfs show callback: write the path of the backing file, followed by a
 * newline, into @buf.
 *
 * The path is resolved with d_path() while lo_ctl_mutex is held, and only
 * when a backing file is set.  If d_path() fails, its error code is
 * returned.  If there is no backing file the pointer stays NULL and the
 * function returns PTR_ERR(NULL), i.e. 0, leaving @buf untouched.
 *
 * On success returns the number of bytes placed in @buf, counting the
 * newline but not the terminating NUL.
 */
static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
{
	char *path = NULL;
	ssize_t len;

	mutex_lock(&lo->lo_ctl_mutex);
	if (lo->lo_backing_file)
		path = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
	mutex_unlock(&lo->lo_ctl_mutex);

	if (IS_ERR_OR_NULL(path))
		return PTR_ERR(path);

	len = strlen(path);
	/*
	 * d_path() was handed buf as scratch space, so the returned string
	 * may overlap the destination: memmove, not memcpy.
	 */
	memmove(buf, path, len);
	buf[len++] = '\n';
	buf[len] = 0;

	return len;
}
/*
 * sysfs show callback: print lo->lo_offset as an unsigned decimal number
 * followed by a newline.  Returns the sprintf() result.
 */
static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
{
	unsigned long long offset = (unsigned long long)lo->lo_offset;

	return sprintf(buf, "%llu\n", offset);
}
/*
 * sysfs show callback: print lo->lo_sizelimit as an unsigned decimal number
 * followed by a newline.  Returns the sprintf() result.
 */
static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
{
	unsigned long long limit = (unsigned long long)lo->lo_sizelimit;

	return sprintf(buf, "%llu\n", limit);
}
/*
 * sysfs show callback: print "1" when LO_FLAGS_AUTOCLEAR is set in
 * lo->lo_flags and "0" otherwise, followed by a newline.
 * Returns the sprintf() result.
 */
static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
{
	const char *state = "0";

	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR)
		state = "1";

	return sprintf(buf, "%s\n", state);
}
/*
 * Instantiate the read-only attributes.  Each LOOP_ATTR_RO(x) defines the
 * device attribute loop_attr_x, backed by loop_attr_x_show() above.
 */
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
/* NULL-terminated list of the attributes exported by loop_attribute_group. */
static struct attribute *loop_attrs[] = {
&loop_attr_backing_file.attr,
&loop_attr_offset.attr,
&loop_attr_sizelimit.attr,
&loop_attr_autoclear.attr,
NULL,
};
/*
 * Attribute group created by loop_sysfs_init() and removed by
 * loop_sysfs_exit() on the loop disk's device kobject.  Being a named
 * group, sysfs places its attributes in a "loop" subdirectory.
 */
static struct attribute_group loop_attribute_group = {
.name = "loop",
.attrs= loop_attrs,
};
/*
 * Create the loop attribute group on the kobject of the loop disk's device.
 * Returns the result of sysfs_create_group().
 */
static int loop_sysfs_init(struct loop_device *lo)
{
	struct kobject *kobj = &disk_to_dev(lo->lo_disk)->kobj;

	return sysfs_create_group(kobj, &loop_attribute_group);
}
/*
 * Remove the loop attribute group from the kobject of the loop disk's
 * device; the counterpart of loop_sysfs_init().
 */
static void loop_sysfs_exit(struct loop_device *lo)
{
	struct kobject *kobj = &disk_to_dev(lo->lo_disk)->kobj;

	sysfs_remove_group(kobj, &loop_attribute_group);
}
static int loop_set_fd(struct loop_device *lo, fmode_t mode,
struct block_device *bdev, unsigned int arg)
{
......@@ -837,6 +935,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
loop_sysfs_init(lo);
/* let user-space know about the new size */
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
......@@ -855,6 +954,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
return 0;
out_clr:
loop_sysfs_exit(lo);
lo->lo_thread = NULL;
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
......@@ -951,6 +1051,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
set_capacity(lo->lo_disk, 0);
if (bdev) {
bd_set_size(bdev, 0);
loop_sysfs_exit(lo);
/* let user-space know about this change */
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
}
......
......@@ -53,10 +53,10 @@
extern const char *drbd_buildtag(void);
#define REL_VERSION "8.3.8.1"
#define REL_VERSION "8.3.9rc2"
#define API_VERSION 88
#define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 94
#define PRO_VERSION_MAX 95
enum drbd_io_error_p {
......@@ -91,6 +91,11 @@ enum drbd_after_sb_p {
ASB_VIOLENTLY
};
/*
 * Values for the on_no_data setting.
 * NOTE(review): judging by the names, these select between failing I/O and
 * suspending I/O when no data is accessible -- confirm against the users
 * of this enum.
 */
enum drbd_on_no_data {
OND_IO_ERROR,
OND_SUSPEND_IO
};
/* KEEP the order, do not delete or insert. Only append. */
enum drbd_ret_codes {
ERR_CODE_BASE = 100,
......@@ -140,6 +145,7 @@ enum drbd_ret_codes {
ERR_CONNECTED = 151, /* DRBD 8.3 only */
ERR_PERM = 152,
ERR_NEED_APV_93 = 153,
ERR_STONITH_AND_PROT_A = 154,
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
......@@ -226,13 +232,17 @@ union drbd_state {
unsigned conn:5 ; /* 17/32 cstates */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned susp:1 ; /* 2/2 IO suspended no/yes */
unsigned susp:1 ; /* 2/2 IO suspended no/yes (by user) */
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned peer_isp:1 ;
unsigned user_isp:1 ;
unsigned _pad:11; /* 0 unused */
unsigned susp_nod:1 ; /* IO suspended because no data */
unsigned susp_fen:1 ; /* IO suspended because fence peer handler runs*/
unsigned _pad:9; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned _pad:11; /* 0 unused */
unsigned _pad:9;
unsigned susp_fen:1 ;
unsigned susp_nod:1 ;
unsigned user_isp:1 ;
unsigned peer_isp:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
......@@ -312,6 +322,8 @@ enum drbd_timeout_flag {
#define DRBD_MAGIC 0x83740267
#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
#define DRBD_MAGIC_BIG 0x835a
#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG)
/* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1
......
......@@ -128,26 +128,31 @@
#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
#define DRBD_MAX_BIO_BVECS_MIN 0
#define DRBD_MAX_BIO_BVECS_MAX 128
#define DRBD_MAX_BIO_BVECS_DEF 0
#define DRBD_DP_VOLUME_MIN 4
#define DRBD_DP_VOLUME_MAX 1048576
#define DRBD_DP_VOLUME_DEF 16384
#define DRBD_C_PLAN_AHEAD_MIN 0
#define DRBD_C_PLAN_AHEAD_MAX 300
#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */
#define DRBD_DP_INTERVAL_MIN 1
#define DRBD_DP_INTERVAL_MAX 600
#define DRBD_DP_INTERVAL_DEF 5
#define DRBD_C_DELAY_TARGET_MIN 1
#define DRBD_C_DELAY_TARGET_MAX 100
#define DRBD_C_DELAY_TARGET_DEF 10
#define DRBD_RS_THROTTLE_TH_MIN 1
#define DRBD_RS_THROTTLE_TH_MAX 600
#define DRBD_RS_THROTTLE_TH_DEF 20
#define DRBD_C_FILL_TARGET_MIN 0
#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */
#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */
#define DRBD_RS_HOLD_OFF_TH_MIN 1
#define DRBD_RS_HOLD_OFF_TH_MAX 6000
#define DRBD_RS_HOLD_OFF_TH_DEF 100
#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */
#define DRBD_C_MAX_RATE_MAX (4 << 20)
#define DRBD_C_MAX_RATE_DEF 102400
#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */
#define DRBD_C_MIN_RATE_MAX (4 << 20)
#define DRBD_C_MIN_RATE_DEF 4096
#undef RANGE
#endif
......@@ -87,6 +87,12 @@ NL_PACKET(syncer_conf, 8,
NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
NL_BIT( 65, T_MAY_IGNORE, use_rle)
NL_INTEGER( 75, T_MAY_IGNORE, on_no_data)
NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead)
NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target)
NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target)
NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate)
NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate)
)
NL_PACKET(invalidate, 9, )
......
......@@ -83,7 +83,7 @@ static inline int ddebug_remove_module(const char *mod)
#define dynamic_pr_debug(fmt, ...) \
do { if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0)
#define dynamic_dev_dbg(dev, format, ...) \
#define dynamic_dev_dbg(dev, fmt, ...) \
do { if (0) dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); } while (0)
#endif
......
......@@ -743,6 +743,7 @@
#define PCI_DEVICE_ID_HP_CISSC 0x3230
#define PCI_DEVICE_ID_HP_CISSD 0x3238
#define PCI_DEVICE_ID_HP_CISSE 0x323a
#define PCI_DEVICE_ID_HP_CISSF 0x323b
#define PCI_DEVICE_ID_HP_ZX2_IOC 0x4031
#define PCI_VENDOR_ID_PCTECH 0x1042
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment