Commit 489d8e55 authored by Alexander Aring, committed by David Teigland

fs: dlm: add reliable connection if reconnect

This patch makes a TCP lowcomms connection reliable even if reconnects
occur. This is done by application-layer re-transmission handling and
sequence numbers in the dlm protocol. There are three new dlm
commands:

DLM_OPTS:

This encapsulates an existing dlm message (and rcom messages if they
don't have their own application-side re-transmission handling).
Optionally, additional TLVs (type-length-value fields) can be appended,
for example a sequence number field. However, because the lockspace
field is unused in DLM_OPTS and a sequence number is mandatory, it isn't
made a TLV; instead we put the sequence number inside the lockspace
id field. The possibility to add optional options is still there for
future purposes.

DLM_ACK:

Just a dlm header to acknowledge the receipt of a DLM_OPTS message to
its sender.

DLM_FIN:

This provides a 4-way handshake for connection termination, including
support for half-closed connections. It's provided at the application
layer because SCTP doesn't support half-closed sockets, the shutdown()
call can be interrupted by e.g. TCP resets, and it is hard to implement
because of the othercon paradigm in lowcomms. The 4-way termination
handshake also solves the problem of synchronizing the peer's EOF arrival
with the cluster manager removing the peer in the node membership
handling of DLM. In some cases messages can still be transmitted during
this time and we need to wait for the node membership event.

To provide a reliable connection the node will retransmit all
unacknowledged messages to its peer on reconnect. The receiver will then
filter the received messages and drop all messages which are
duplicates.

As RCOM_STATUS and RCOM_NAMES messages are the first messages which are
exchanged and they have their own re-transmission handling, there exists
logic that these messages must be first. When these messages arrive we
store the dlm version field. This handling is the same on DLM 3.1 and,
after this patch, on 3.2. A backwards compatibility handling has been
added which seems to work in tests without tcpkill; however, it's not
recommended to use DLM 3.1 and 3.2 at the same time, because DLM 3.2
tries to fix long-term bugs in the DLM protocol.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
parent 8e2e4086
...@@ -371,18 +371,26 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) ...@@ -371,18 +371,26 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
/* dlm_header is first element of all structs sent between nodes */ /* dlm_header is first element of all structs sent between nodes */
#define DLM_HEADER_MAJOR 0x00030000 #define DLM_HEADER_MAJOR 0x00030000
#define DLM_HEADER_MINOR 0x00000001 #define DLM_HEADER_MINOR 0x00000002
#define DLM_VERSION_3_1 0x00030001
#define DLM_VERSION_3_2 0x00030002
#define DLM_HEADER_SLOTS 0x00000001 #define DLM_HEADER_SLOTS 0x00000001
#define DLM_MSG 1 #define DLM_MSG 1
#define DLM_RCOM 2 #define DLM_RCOM 2
#define DLM_OPTS 3
#define DLM_ACK 4
#define DLM_FIN 5
struct dlm_header { struct dlm_header {
uint32_t h_version; uint32_t h_version;
union { union {
/* for DLM_MSG and DLM_RCOM */ /* for DLM_MSG and DLM_RCOM */
uint32_t h_lockspace; uint32_t h_lockspace;
/* for DLM_ACK and DLM_OPTS */
uint32_t h_seq;
} u; } u;
uint32_t h_nodeid; /* nodeid of sender */ uint32_t h_nodeid; /* nodeid of sender */
uint16_t h_length; uint16_t h_length;
...@@ -390,7 +398,6 @@ struct dlm_header { ...@@ -390,7 +398,6 @@ struct dlm_header {
uint8_t h_pad; uint8_t h_pad;
}; };
#define DLM_MSG_REQUEST 1 #define DLM_MSG_REQUEST 1
#define DLM_MSG_CONVERT 2 #define DLM_MSG_CONVERT 2
#define DLM_MSG_UNLOCK 3 #define DLM_MSG_UNLOCK 3
...@@ -458,10 +465,29 @@ struct dlm_rcom { ...@@ -458,10 +465,29 @@ struct dlm_rcom {
char rc_buf[]; char rc_buf[];
}; };
/*
 * Header of a single optional field that can be appended to a DLM_OPTS
 * message. NOTE(review): presumably a type/length/value (TLV) record as
 * described in the commit message; confirm against the midcomms code,
 * which is not visible here.
 */
struct dlm_opt_header {
/* presumably identifies the kind of option -- TODO confirm */
uint16_t t_type;
/* presumably the length of t_value -- TODO confirm units/what is counted */
uint16_t t_length;
/* padding; the three fixed fields add up to 8 bytes before t_value */
uint32_t o_pad;
/* needs to be 8 byte aligned */
char t_value[];
};
/*
 * Encapsulation header used by DLM_OPTS: it starts with the common
 * dlm_header, which is the first element of all structs sent between
 * nodes. For DLM_OPTS the header's union carries the sequence number
 * (h_seq) in place of the lockspace id.
 */
struct dlm_opts {
/* common dlm header */
struct dlm_header o_header;
/* NOTE(review): presumably the command of the encapsulated message
 * (e.g. DLM_MSG or DLM_RCOM) -- confirm against midcomms code
 */
uint8_t o_nextcmd;
/* padding */
uint8_t o_pad;
/* NOTE(review): presumably the total length of the optional fields
 * stored in o_opts -- confirm against midcomms code
 */
uint16_t o_optlen;
/* padding */
uint32_t o_pad2;
/* variable-length trailing data (flexible array member) */
char o_opts[];
};
union dlm_packet { union dlm_packet {
struct dlm_header header; /* common to other two */ struct dlm_header header; /* common to other two */
struct dlm_message message; struct dlm_message message;
struct dlm_rcom rcom; struct dlm_rcom rcom;
struct dlm_opts opts;
}; };
#define DLM_RSF_NEED_SLOTS 0x00000001 #define DLM_RSF_NEED_SLOTS 0x00000001
......
...@@ -567,7 +567,12 @@ static int new_lockspace(const char *name, const char *cluster, ...@@ -567,7 +567,12 @@ static int new_lockspace(const char *name, const char *cluster,
mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_requestqueue_mutex);
mutex_init(&ls->ls_clear_proc_locks); mutex_init(&ls->ls_clear_proc_locks);
ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS); /* Due backwards compatibility with 3.1 we need to use maximum
* possible dlm message size to be sure the message will fit and
* not having out of bounds issues. However on sending side 3.2
* might send less.
*/
ls->ls_recover_buf = kmalloc(DEFAULT_BUFFER_SIZE, GFP_NOFS);
if (!ls->ls_recover_buf) if (!ls->ls_recover_buf)
goto out_lkbidr; goto out_lkbidr;
......
...@@ -1762,8 +1762,10 @@ static void process_send_sockets(struct work_struct *work) ...@@ -1762,8 +1762,10 @@ static void process_send_sockets(struct work_struct *work)
clear_bit(CF_WRITE_PENDING, &con->flags); clear_bit(CF_WRITE_PENDING, &con->flags);
if (test_and_clear_bit(CF_RECONNECT, &con->flags)) if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
close_connection(con, false, false, true); close_connection(con, false, false, true);
dlm_midcomms_unack_msg_resend(con->nodeid);
}
if (con->sock == NULL) { /* not mutex protected so check it inside too */ if (con->sock == NULL) { /* not mutex protected so check it inside too */
if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
......
...@@ -12,7 +12,12 @@ ...@@ -12,7 +12,12 @@
#ifndef __LOWCOMMS_DOT_H__ #ifndef __LOWCOMMS_DOT_H__
#define __LOWCOMMS_DOT_H__ #define __LOWCOMMS_DOT_H__
#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 #include "dlm_internal.h"
#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts)
#define LOWCOMMS_MAX_TX_BUFFER_LEN (DEFAULT_BUFFER_SIZE - \
DLM_MIDCOMMS_OPT_LEN)
#define CONN_HASH_SIZE 32 #define CONN_HASH_SIZE 32
/* This is deliberately very simple because most clusters have simple /* This is deliberately very simple because most clusters have simple
......
This diff is collapsed.
...@@ -21,6 +21,7 @@ int dlm_midcomms_start(void); ...@@ -21,6 +21,7 @@ int dlm_midcomms_start(void);
void dlm_midcomms_shutdown(void); void dlm_midcomms_shutdown(void);
void dlm_midcomms_add_member(int nodeid); void dlm_midcomms_add_member(int nodeid);
void dlm_midcomms_remove_member(int nodeid); void dlm_midcomms_remove_member(int nodeid);
void dlm_midcomms_unack_msg_resend(int nodeid);
#endif /* __MIDCOMMS_DOT_H__ */ #endif /* __MIDCOMMS_DOT_H__ */
...@@ -202,7 +202,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) ...@@ -202,7 +202,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
allow_sync_reply(ls, &rc->rc_id); allow_sync_reply(ls, &rc->rc_id);
memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); memset(ls->ls_recover_buf, 0, DEFAULT_BUFFER_SIZE);
send_rcom_stateless(ls, msg, rc); send_rcom_stateless(ls, msg, rc);
...@@ -325,7 +325,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) ...@@ -325,7 +325,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
memcpy(rc->rc_buf, last_name, last_len); memcpy(rc->rc_buf, last_name, last_len);
allow_sync_reply(ls, &rc->rc_id); allow_sync_reply(ls, &rc->rc_id);
memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); memset(ls->ls_recover_buf, 0, DEFAULT_BUFFER_SIZE);
send_rcom_stateless(ls, msg, rc); send_rcom_stateless(ls, msg, rc);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment