Browse Source

Merge branch 'timer-retry' into 'next'

make federation retry timer-based

Closes #142

See merge request famedly/conduit!181
merge-requests/181/merge
Jonathan de Jong 4 years ago
parent
commit
8e7c272081
  1. 4
      src/database.rs
  2. 283
      src/database/sending.rs
  3. 9
      src/error.rs

4
src/database.rs

@ -211,6 +211,7 @@ impl Database {
let (admin_sender, admin_receiver) = mpsc::unbounded(); let (admin_sender, admin_receiver) = mpsc::unbounded();
let (sending_sender, sending_receiver) = mpsc::unbounded(); let (sending_sender, sending_receiver) = mpsc::unbounded();
let (waking_sender, waking_receiver) = mpsc::unbounded();
let db = Arc::new(TokioRwLock::from(Self { let db = Arc::new(TokioRwLock::from(Self {
_db: builder.clone(), _db: builder.clone(),
@ -328,6 +329,7 @@ impl Database {
servercurrentevent_data: builder.open_tree("servercurrentevent_data")?, servercurrentevent_data: builder.open_tree("servercurrentevent_data")?,
maximum_requests: Arc::new(Semaphore::new(config.max_concurrent_requests as usize)), maximum_requests: Arc::new(Semaphore::new(config.max_concurrent_requests as usize)),
sender: sending_sender, sender: sending_sender,
waker: waking_sender,
}, },
admin: admin::Admin { admin: admin::Admin {
sender: admin_sender, sender: admin_sender,
@ -751,7 +753,7 @@ impl Database {
guard.admin.start_handler(Arc::clone(&db), admin_receiver); guard.admin.start_handler(Arc::clone(&db), admin_receiver);
guard guard
.sending .sending
.start_handler(Arc::clone(&db), sending_receiver); .start_handler(Arc::clone(&db), sending_receiver, waking_receiver);
drop(guard); drop(guard);

283
src/database/sending.rs

@ -2,6 +2,7 @@ use std::{
collections::{BTreeMap, HashMap, HashSet}, collections::{BTreeMap, HashMap, HashSet},
convert::{TryFrom, TryInto}, convert::{TryFrom, TryInto},
fmt::Debug, fmt::Debug,
future::Future,
sync::Arc, sync::Arc,
time::{Duration, Instant}, time::{Duration, Instant},
}; };
@ -34,9 +35,9 @@ use ruma::{
}; };
use tokio::{ use tokio::{
select, select,
sync::{RwLock, Semaphore}, sync::{oneshot, RwLock, Semaphore},
}; };
use tracing::{error, warn}; use tracing::{debug, error, warn};
use super::abstraction::Tree; use super::abstraction::Tree;
@ -73,6 +74,21 @@ impl OutgoingKind {
prefix prefix
} }
fn describe(&self) -> std::result::Result<String, std::string::FromUtf8Error> {
Ok(match self {
OutgoingKind::Appservice(appservice_id) => {
format!("Appservice (ID {})", appservice_id)
}
OutgoingKind::Push(user, _) => {
let user = utils::string_from_bytes(user)?;
format!("User Push Service (for {})", user)
}
OutgoingKind::Normal(server) => {
format!("Matrix Server ({})", server)
}
})
}
} }
#[derive(Clone, Debug, PartialEq, Eq, Hash)] #[derive(Clone, Debug, PartialEq, Eq, Hash)]
@ -88,26 +104,59 @@ pub struct Sending {
pub(super) servercurrentevent_data: Arc<dyn Tree>, // ServerCurrentEvents = (+ / $)ServerName / UserId + PduId / Id (for edus), Data = EDU content pub(super) servercurrentevent_data: Arc<dyn Tree>, // ServerCurrentEvents = (+ / $)ServerName / UserId + PduId / Id (for edus), Data = EDU content
pub(super) maximum_requests: Arc<Semaphore>, pub(super) maximum_requests: Arc<Semaphore>,
pub sender: mpsc::UnboundedSender<(Vec<u8>, Vec<u8>)>, pub sender: mpsc::UnboundedSender<(Vec<u8>, Vec<u8>)>,
pub(super) waker: mpsc::UnboundedSender<OutgoingKind>,
} }
enum TransactionStatus { enum TransactionStatus {
// Currently running, in outgoing loop
Running, Running,
Failed(u32, Instant), // number of times failed, time of last failure Retrying {
Retrying(u32), // number of times failed tries: u32,
},
// Retrying, in timers loop
Failed {
tries: u32,
wake: Option<oneshot::Sender<()>>,
},
}
fn make_timeout(
at: Instant,
outgoing: Vec<u8>,
) -> (oneshot::Sender<()>, impl Future<Output = Vec<u8>>) {
let (tx, rx) = oneshot::channel();
let at = tokio::time::Instant::from_std(at);
(tx, async move {
let _ = tokio::time::timeout_at(at, rx).await;
outgoing
})
} }
impl Sending { impl Sending {
pub fn start_handler( pub(super) fn start_handler(
&self, &self,
db: Arc<RwLock<Database>>,
receiver: mpsc::UnboundedReceiver<(Vec<u8>, Vec<u8>)>,
waker: mpsc::UnboundedReceiver<OutgoingKind>,
) {
tokio::spawn(Self::handler(db, receiver, waker));
}
async fn handler(
db: Arc<RwLock<Database>>, db: Arc<RwLock<Database>>,
mut receiver: mpsc::UnboundedReceiver<(Vec<u8>, Vec<u8>)>, mut receiver: mpsc::UnboundedReceiver<(Vec<u8>, Vec<u8>)>,
mut waker: mpsc::UnboundedReceiver<OutgoingKind>,
) { ) {
tokio::spawn(async move { let mut outgoing = FuturesUnordered::new();
let mut futures = FuturesUnordered::new();
let mut timers = FuturesUnordered::new();
let mut current_transaction_status = HashMap::<Vec<u8>, TransactionStatus>::new(); let mut current_transaction_status = HashMap::<Vec<u8>, TransactionStatus>::new();
// Retry requests we could not finish yet // Retry requests we could not finish yet
{
let mut initial_transactions = HashMap::<OutgoingKind, Vec<SendingEventType>>::new(); let mut initial_transactions = HashMap::<OutgoingKind, Vec<SendingEventType>>::new();
let guard = db.read().await; let guard = db.read().await;
@ -143,29 +192,66 @@ impl Sending {
for (outgoing_kind, events) in initial_transactions { for (outgoing_kind, events) in initial_transactions {
current_transaction_status current_transaction_status
.insert(outgoing_kind.get_prefix(), TransactionStatus::Running); .insert(outgoing_kind.get_prefix(), TransactionStatus::Running);
futures.push(Self::handle_events( outgoing.push(Self::handle_events(
outgoing_kind.clone(), outgoing_kind.clone(),
events, events,
Arc::clone(&db), Arc::clone(&db),
)); ));
} }
}
loop { loop {
select! { select! {
Some(response) = futures.next() => { // New transactions to be sent out (from server/user activity)
Some((key, value)) = receiver.next() => {
match Self::parse_servercurrentevent(&key, value) {
Ok((outgoing_kind, event)) => {
let guard = db.read().await;
match Self::prepare_transaction(
&outgoing_kind,
vec![(event, key)],
&mut current_transaction_status,
&guard,
false,
) {
Ok(Some(events)) => {
outgoing.push(Self::handle_events(outgoing_kind, events, Arc::clone(&db)));
}
Ok(None) => {
debug!("prepare_transaction produced not ready for {:?} when receiving event", outgoing_kind)
}
Err(err) => {
error!("prepare_transaction produced an error for {:?}; {}", outgoing_kind, err)
}
}
},
Err(err) => {
error!("parse_servercurrentevent produced an error for {:?}; {}", key, err)
}
}
},
// Result of outgoing transactions
Some(response) = outgoing.next() => {
match response { match response {
// Outgoing transaction succeeded
Ok(outgoing_kind) => { Ok(outgoing_kind) => {
let guard = db.read().await; let guard = db.read().await;
let sending = &guard.sending;
// Clear all outgoing events that were just sent out.
let prefix = outgoing_kind.get_prefix(); let prefix = outgoing_kind.get_prefix();
for (key, _) in guard.sending.servercurrentevent_data for (key, _) in sending.servercurrentevent_data
.scan_prefix(prefix.clone()) .scan_prefix(prefix.clone())
{ {
guard.sending.servercurrentevent_data.remove(&key).unwrap(); sending.servercurrentevent_data.remove(&key).unwrap();
} }
// Find events that have been added since starting the last request // Find events that have been added since starting the last request
let new_events = guard.sending.servernameevent_data let new_events = sending.servernameevent_data
.scan_prefix(prefix.clone()) .scan_prefix(prefix.clone())
.filter_map(|(k, v)| { .filter_map(|(k, v)| {
Self::parse_servercurrentevent(&k, v).ok().map(|ev| (ev, k)) Self::parse_servercurrentevent(&k, v).ok().map(|ev| (ev, k))
@ -179,13 +265,14 @@ impl Sending {
// Insert pdus we found // Insert pdus we found
for (e, key) in &new_events { for (e, key) in &new_events {
let value = if let SendingEventType::Edu(value) = &e.1 { &**value } else { &[] }; let value = if let SendingEventType::Edu(value) = &e.1 { &**value } else { &[] };
guard.sending.servercurrentevent_data.insert(&key, value).unwrap(); sending.servercurrentevent_data.insert(&key, value).unwrap();
guard.sending.servernameevent_data.remove(&key).unwrap(); sending.servernameevent_data.remove(&key).unwrap();
} }
drop(guard); // Clear retries
current_transaction_status.insert(prefix, TransactionStatus::Running);
futures.push( outgoing.push(
Self::handle_events( Self::handle_events(
outgoing_kind.clone(), outgoing_kind.clone(),
new_events.into_iter().map(|(event, _)| event.1).collect(), new_events.into_iter().map(|(event, _)| event.1).collect(),
@ -196,74 +283,150 @@ impl Sending {
current_transaction_status.remove(&prefix); current_transaction_status.remove(&prefix);
} }
} }
Err((outgoing_kind, _)) => {
current_transaction_status.entry(outgoing_kind.get_prefix()).and_modify(|e| *e = match e { // Outgoing transaction failed
TransactionStatus::Running => TransactionStatus::Failed(1, Instant::now()), Err((outgoing_kind, err)) => {
TransactionStatus::Retrying(n) => TransactionStatus::Failed(*n+1, Instant::now()), // Set status to Failed, create timer
TransactionStatus::Failed(_, _) => { let timer = Self::generate_next_timer(&mut current_transaction_status, outgoing_kind.get_prefix());
error!("Request that was not even running failed?!");
return // Add timer to loop
timers.push(timer);
let destination = match outgoing_kind.describe() {
Ok(d) => d,
Err(err) => {
warn!("Could not describe {:?}: {}", outgoing_kind, err);
continue;
}, },
}); };
warn!("Outgoing request to {} failed: {}", destination, err);
} }
}; };
}, },
Some((key, value)) = receiver.next() => {
if let Ok((outgoing_kind, event)) = Self::parse_servercurrentevent(&key, value) { // Transaction retry timers firing
Some(prefix) = timers.next() => {
match Self::parse_servercurrentevent(&prefix, vec![]) {
Ok((outgoing_kind, _)) => {
let guard = db.read().await; let guard = db.read().await;
if let Ok(Some(events)) = Self::select_events( // Transition Failed => Retrying, return pending old transaction events
match Self::prepare_transaction(
&outgoing_kind, &outgoing_kind,
vec![(event, key)], vec![], // will be ignored because retry == true anyways
&mut current_transaction_status, &mut current_transaction_status,
&guard &guard,
true,
) { ) {
futures.push(Self::handle_events(outgoing_kind, events, Arc::clone(&db))); Ok(Some(events)) => {
outgoing.push(Self::handle_events(outgoing_kind, events, Arc::clone(&db)));
} }
Ok(None) => {
// Unreachable because retry == true
unreachable!("prepare_transaction went over to Ok(None), while being fired by timer (for {:?})", outgoing_kind)
} }
Err(err) => {
error!("prepare_transaction produced an error for {:?}; {}", outgoing_kind, err)
} }
} }
},
Err(err) => {
error!("parse_servercurrentevent produced an error for {:?}; {}", prefix, err)
} }
});
} }
},
#[tracing::instrument(skip(outgoing_kind, new_events, current_transaction_status, db))] // Explicit wakeups, makes an in-flight timer return immediately
fn select_events( Some(outgoing) = waker.next() => {
outgoing_kind: &OutgoingKind, let prefix = outgoing.get_prefix();
new_events: Vec<(SendingEventType, Vec<u8>)>, // Events we want to send: event and full key if let Some(TransactionStatus::Failed { wake, .. }) = current_transaction_status.get_mut(&prefix) {
current_transaction_status: &mut HashMap<Vec<u8>, TransactionStatus>, if let Some(wake) = wake.take() {
db: &Database, let _ = wake.send(());
) -> Result<Option<Vec<SendingEventType>>> { }
let mut retry = false; }
let mut allow = true; },
}
}
}
let prefix = outgoing_kind.get_prefix(); /// Generates timer/oneshot, alters status to reflect Failed
let entry = current_transaction_status.entry(prefix.clone()); ///
/// Returns timer/oneshot future to wake up loop for next retry
fn generate_next_timer(
status: &mut HashMap<Vec<u8>, TransactionStatus>,
prefix: Vec<u8>,
) -> impl Future<Output = Vec<u8>> {
let now = Instant::now();
entry let entry = status
.and_modify(|e| match e { .get_mut(&prefix)
TransactionStatus::Running | TransactionStatus::Retrying(_) => { .expect("guaranteed to be set before this function");
allow = false; // already running
} let new_tries = match entry {
TransactionStatus::Failed(tries, time) => { // Running -> Failed
// Fail if a request has failed recently (exponential backoff) TransactionStatus::Running => 1,
let mut min_elapsed_duration = Duration::from_secs(30) * (*tries) * (*tries); // Retrying -> Failed
if min_elapsed_duration > Duration::from_secs(60 * 60 * 24) { TransactionStatus::Retrying { tries } => *tries + 1,
min_elapsed_duration = Duration::from_secs(60 * 60 * 24);
// The transition of Failed -> Retrying is handled in another loop, this is impossible
TransactionStatus::Failed { .. } => {
unreachable!("Request that was not even running failed?!")
} }
};
if time.elapsed() < min_elapsed_duration { const ONE_DAY: Duration = Duration::from_secs(60 * 60 * 24);
allow = false;
} else { // Exponential backoff, clamp upper value to one day
retry = true; let next_wakeup = now + (Duration::from_secs(30) * new_tries * new_tries).min(ONE_DAY);
*e = TransactionStatus::Retrying(*tries);
let (wake, fut) = make_timeout(next_wakeup, prefix);
*entry = TransactionStatus::Failed {
tries: new_tries,
wake: Some(wake),
};
fut
} }
/// Wake up any timers for a particular outgoing kind.
pub fn wake_outgoing(&self, outgoing: OutgoingKind) {
let _ = self.waker.unbounded_send(outgoing);
} }
})
.or_insert(TransactionStatus::Running);
if !allow { #[tracing::instrument(skip(outgoing_kind, new_events, status, db))]
fn prepare_transaction(
outgoing_kind: &OutgoingKind,
new_events: Vec<(SendingEventType, Vec<u8>)>, // Events we want to send: event and full key
status: &mut HashMap<Vec<u8>, TransactionStatus>,
db: &Database,
retry: bool,
) -> Result<Option<Vec<SendingEventType>>> {
let prefix = outgoing_kind.get_prefix();
// If we're not retrying, and there's already an entry in the hashmap (retrying, failed, or running), bail.
//
// Transitioning from Failed to Retrying is not this function's responsibility, it's the timer loop's job.
//
// When retrying, it's asserted that the hashmap exists and has Failed, if it doesn't, its an error state.
if !retry && status.get(&prefix).is_some() {
return Ok(None); return Ok(None);
} else if retry {
// Retrying, asserting hashmap is Failed, bail otherwise.
let tries: u32 = match status.get(&prefix) {
Some(txn_status) => match txn_status {
TransactionStatus::Failed { tries, .. } => Ok(tries.clone()),
_ => Err(Error::bad_state("state hashmap entry was not Failed")),
},
None => Err(Error::bad_state("state hashmap entry was absent")),
}?;
// When transitioning Retrying => Failed, tries was already incremented, no need to do it here.
status.insert(prefix.clone(), TransactionStatus::Retrying { tries });
} else {
// Not retrying, hashmap didnt exist
status.insert(prefix.clone(), TransactionStatus::Running);
} }
let mut events = Vec::new(); let mut events = Vec::new();

9
src/error.rs

@ -59,10 +59,14 @@ pub enum Error {
#[error("{0}")] #[error("{0}")]
BadServerResponse(&'static str), BadServerResponse(&'static str),
#[error("{0}")] #[error("{0}")]
/// Don't create this directly. Use Error::bad_config instead.
BadConfig(&'static str), BadConfig(&'static str),
#[error("{0}")] #[error("{0}")]
/// Don't create this directly. Use Error::bad_database instead. /// Don't create this directly. Use Error::bad_database instead.
BadDatabase(&'static str), BadDatabase(&'static str),
#[error("{0}")]
/// Don't create this directly. Use Error::bad_state instead.
BadState(&'static str),
#[error("uiaa")] #[error("uiaa")]
Uiaa(UiaaInfo), Uiaa(UiaaInfo),
#[error("{0}: {1}")] #[error("{0}: {1}")]
@ -81,6 +85,11 @@ impl Error {
error!("BadConfig: {}", message); error!("BadConfig: {}", message);
Self::BadConfig(message) Self::BadConfig(message)
} }
pub fn bad_state(message: &'static str) -> Self {
error!("BadState: {}", message);
Self::BadState(message)
}
} }
impl Error { impl Error {

Loading…
Cancel
Save