Skip to content

Commit

Permalink
fix: watchdog race condition
Browse files Browse the repository at this point in the history
  • Loading branch information
mhuisi committed Jul 24, 2024
1 parent af40e61 commit 798703c
Showing 1 changed file with 21 additions and 8 deletions.
29 changes: 21 additions & 8 deletions src/Lean/Server/Watchdog.lean
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ section Utils
| crashed (e : IO.Error)
| ioError (e : IO.Error)

inductive CrashOrigin
| fileWorkerToClientForwarding
| clientToFileWorkerForwarding

inductive WorkerState where
/-- The watchdog can detect a crashed file worker in two places: When trying to send a message
to the file worker and when reading a request reply.
Expand All @@ -98,7 +102,7 @@ section Utils
that are in-flight are errored. Upon receiving the next packet for that file worker, the file
worker is restarted and the packet is forwarded to it. If the crash was detected while writing
a packet, we queue that packet until the next packet for the file worker arrives. -/
| crashed (queuedMsgs : Array JsonRpc.Message)
| crashed (queuedMsgs : Array JsonRpc.Message) (origin: CrashOrigin)
| running

abbrev PendingRequestMap := RBMap RequestID JsonRpc.Message compare
Expand Down Expand Up @@ -404,10 +408,10 @@ section ServerM
return
eraseFileWorker uri

def handleCrash (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) : ServerM Unit := do
def handleCrash (uri : DocumentUri) (queuedMsgs : Array JsonRpc.Message) (origin: CrashOrigin) : ServerM Unit := do
let some fw ← findFileWorker? uri
| return
updateFileWorkers { fw with state := WorkerState.crashed queuedMsgs }
updateFileWorkers { fw with state := WorkerState.crashed queuedMsgs origin }

/-- Tries to write a message, sets the state of the FileWorker to `crashed` if it does not succeed
and restarts the file worker if the `crashed` flag was already set. Just logs an error if
Expand All @@ -423,7 +427,7 @@ section ServerM
let some fw ← findFileWorker? uri
| return
match fw.state with
| WorkerState.crashed queuedMsgs =>
| WorkerState.crashed queuedMsgs _ =>
let mut queuedMsgs := queuedMsgs
if queueFailedMessage then
queuedMsgs := queuedMsgs.push msg
Expand All @@ -442,7 +446,7 @@ section ServerM
catch _ =>
crashedMsgs := crashedMsgs.push msg
if ¬ crashedMsgs.isEmpty then
handleCrash uri crashedMsgs
handleCrash uri crashedMsgs .clientToFileWorkerForwarding
| WorkerState.running =>
let initialQueuedMsgs :=
if queueFailedMessage then
Expand All @@ -452,7 +456,7 @@ section ServerM
try
fw.stdin.writeLspMessage msg
catch _ =>
handleCrash uri initialQueuedMsgs
handleCrash uri initialQueuedMsgs .clientToFileWorkerForwarding

/--
Sends a notification to the file worker identified by `uri` that its dependency `staleDependency`
Expand Down Expand Up @@ -955,7 +959,16 @@ section MainLoop
let workers ← st.fileWorkersRef.get
let mut workerTasks := #[]
for (_, fw) in workers do
if let WorkerState.running := fw.state then
-- When the forwarding task crashes, its return value will be stuck at
-- `WorkerEvent.crashed _`.
-- We do not want to handle this event only once, not over and over again,
-- so once the state becomes `WorkerState.crashed _ .fileWorkerToClientForwarding`
-- as a result of `WorkerEvent.crashed _`, we stop handling this event until
-- eventually the file worker is restarted by a notification from the client.
-- We do not want to filter the forwarding task in case of
-- `WorkerState.crashed _ .clientToFileWorkerForwarding`, since the forwarding task
-- exit code may still contain valuable information in this case (e.g. that the imports changed).
if !(fw.state matches WorkerState.crashed _ .fileWorkerToClientForwarding) then
workerTasks := workerTasks.push <| fw.commTask.map (ServerEvent.workerEvent fw)

let ev ← IO.waitAny (clientTask :: workerTasks.toList)
Expand Down Expand Up @@ -984,7 +997,7 @@ section MainLoop
| WorkerEvent.ioError e =>
throwServerError s!"IO error while processing events for {fw.doc.uri}: {e}"
| WorkerEvent.crashed _ =>
handleCrash fw.doc.uri #[]
handleCrash fw.doc.uri #[] .fileWorkerToClientForwarding
mainLoop clientTask
| WorkerEvent.terminated =>
throwServerError <| "Internal server error: got termination event for worker that "
Expand Down

0 comments on commit 798703c

Please sign in to comment.