Skip to content

Commit 8cd5528

Browse files
committed
fixes for nodelist oos
Turn off config patching; induce churn to the refute arrays; wait 4 min before syncing data; additional logging / log changes. Added a second spot in the digest cycle to patch the nodeListHash after we applyNodeListChange. Fix idea 2: syncV2 already has the nodelist, so we don't want to transform it anymore. More logs; fix in initRefuteCyclesForNode; a few more updates; adjust logs / turn off debug code; remove comment; WIP on itn4-1.16.4rc1; format fix.
1 parent 8385dcf commit 8cd5528

File tree

7 files changed

+78
-5
lines changed

7 files changed

+78
-5
lines changed

src/logger/index.ts

+4
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ export type LogFlags = {
120120
verboseNestedCounters: boolean // extra logging for nested counters
121121

122122
node_rotation_debug: boolean // extra logging for node rotation math
123+
124+
p2pSyncDebug: boolean
123125
}
124126

125127
export let logFlags: LogFlags = {
@@ -163,6 +165,8 @@ export let logFlags: LogFlags = {
163165
verboseNestedCounters: false,
164166

165167
node_rotation_debug: false,
168+
169+
p2pSyncDebug: false,
166170
}
167171

168172
const filePath1 = path.join(process.cwd(), 'data-logs', 'cycleRecords1.txt')

src/p2p/CycleChain.ts

+3
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ export function validate(
8181
info('validate: prev.standbylist', prev.standbyNodeListHash)
8282
info('validate: next.standbylist', next.standbyNodeListHash)
8383

84+
info('validate: prev.nodelist', prev.nodeListHash)
85+
info('validate: next.nodelist', next.nodeListHash)
86+
8487
if (next.previous !== prevMarker) {
8588
info('validate: ERROR: next.previous !== prevMarker')
8689
return false

src/p2p/NodeList.ts

+26-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,12 @@ const isRefuteCyclesEnabled = (cycle: P2P.CycleCreatorTypes.CycleRecord | null)
114114

115115
const initRefuteCyclesForNode = (node: P2P.NodeListTypes.Node, cycle: P2P.CycleCreatorTypes.CycleRecord | null) => {
116116
if (isRefuteCyclesEnabled(cycle)) {
117-
node.refuteCycles = []
117+
// This if check is a critical fix. This is because initRefuteCyclesForNode gets called
118+
// as part of addNode and addNode gets called in syncV2 after we download the nodelist and then
119+
// pass it to addNodes. Having the check here will make sure we don't wipe out existing
120+
// data in refuteCycles that we just downloaded.
121+
if(node.refuteCycles == null)
122+
node.refuteCycles = []
118123
}
119124
}
120125

@@ -391,6 +396,26 @@ export function updateNodes(
391396
cycle: P2P.CycleCreatorTypes.CycleRecord | null
392397
) {
393398
for (const update of updates) updateNode(update, raiseEvents, cycle)
399+
400+
/* //LOCAL_OOS_TEST_SUPPORT
401+
let stats = {init:0, push:0}
402+
const refuteCyclesEnabled = isRefuteCyclesEnabled(cycle)
403+
// every cycle mess with every node
404+
for (const node of activeByIdOrder){
405+
if (refuteCyclesEnabled) {
406+
if(node.refuteCycles == null){
407+
node.refuteCycles = []
408+
stats.init++
409+
} else {
410+
node.refuteCycles.push(cycle.counter)
411+
stats.push++
412+
}
413+
}
414+
}
415+
416+
info(`NodeList.updateNodes: ${updates.length} ${Utils.safeStringify(stats)} enableProblematicNodeRemoval: ${config.p2p.enableProblematicNodeRemoval} cycle: ${cycle?.counter} refuteCyclesEnabled: ${refuteCyclesEnabled} enableProblematicNodeRemovalOnCycle: ${config.p2p.enableProblematicNodeRemovalOnCycle}`)
417+
*/
418+
394419
}
395420

396421
export function updateProblematicNodeTracking(cycle: P2P.CycleCreatorTypes.CycleRecord | null) {

src/p2p/Sync.ts

+14-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import util from 'util'
44
import * as http from '../http'
55
import { P2P } from '@shardus/types'
66
import { reversed, validateTypes } from '../utils'
7-
import { config, logger, network } from './Context'
7+
import { config, crypto, logger, network } from './Context'
88
import * as Archivers from './Archivers'
99
import * as CycleChain from './CycleChain'
1010
import * as CycleCreator from './CycleCreator'
@@ -366,11 +366,11 @@ export function digestCycle(cycle: P2P.CycleCreatorTypes.CycleRecord, source: st
366366
// standby list, but not with the validator and archivers lists
367367

368368
const newNodeListHash = NodeList.computeNewNodeListHash()
369-
if (source === 'syncV2' && newNodeListHash !== cycle.nodeListHash) warn(`sync:digestCycle source: ${source} cycle: ${cycle.counter} patching nodelisthash ${cycle.nodeListHash} -> ${newNodeListHash}`)
369+
if (newNodeListHash !== cycle.nodeListHash) warn(`sync:digestCycle source: ${source} cycle: ${cycle.counter} patching nodelisthash ${cycle.nodeListHash} -> ${newNodeListHash}`)
370370
cycle.nodeListHash = newNodeListHash
371371

372372
const newArchiverListHash = Archivers.computeNewArchiverListHash()
373-
if (source === 'syncV2' && newArchiverListHash !== cycle.archiverListHash) warn(`sync:digestCycle source: ${source} cycle: ${cycle.counter} patching archiverlisthash ${cycle.archiverListHash} -> ${newArchiverListHash}`)
373+
if (newArchiverListHash !== cycle.archiverListHash) warn(`sync:digestCycle source: ${source} cycle: ${cycle.counter} patching archiverlisthash ${cycle.archiverListHash} -> ${newArchiverListHash}`)
374374
cycle.archiverListHash = newArchiverListHash
375375

376376
// for join v2, also get the standby node list hash
@@ -419,9 +419,16 @@ export function digestCycle(cycle: P2P.CycleCreatorTypes.CycleRecord, source: st
419419
)} CycleCreator.currentCycle: ${CycleCreator.currentCycle}`
420420
)
421421

422+
422423
const changes = parse(cycle)
424+
423425
applyNodeListChange(changes, true, cycle)
424426

427+
if (logFlags.important_as_error) {
428+
const newNodeListHash = crypto.hash(NodeList.byJoinOrder) //computeNewNodeListHash not safe due to side effects
429+
warn(`sync:digestCycle after applyNodeListChange source: ${source} cycle: ${cycle.counter} prev nodelisthash ${cycle.nodeListHash} next ${newNodeListHash}`)
430+
}
431+
425432
// for join v2, also add any new standby nodes to the standby node list
426433
// and remove any standby nodes that have unjoined.
427434
if (config.p2p.useJoinProtocolV2) {
@@ -446,7 +453,10 @@ export function digestCycle(cycle: P2P.CycleCreatorTypes.CycleRecord, source: st
446453
}
447454

448455
CycleChain.append(cycle)
449-
info(`digestCycle: marker of cycle${cycle.counter} from ${source} after digest is ${CycleChain.computeCycleMarker(cycle)}`)
456+
const digestedCycleMarker = CycleChain.computeCycleMarker(cycle)
457+
info(`digestCycle: marker of cycle${cycle.counter} from ${source} after digest is ${digestedCycleMarker}`)
458+
459+
/* prettier-ignore */ if (logFlags.important_as_error) info(`digestCycle: cycle: ${Utils.safeStringify(cycle)}`)
450460

451461
// TODO: This seems like a possible location to intervene if our node
452462
// is getting far behind on what it thinks the current cycle is

src/p2p/SyncV2/queries.ts

+15
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ export function getCycleDataFromNode(
156156
node: ActiveNode,
157157
expectedMarker: hexstring
158158
): ResultAsync<CycleRecord, Error> {
159+
info(`getCycleDataFromNode: expectedMarker: ${expectedMarker}`)
160+
159161
return attemptSimpleFetch(node, 'cycle-by-marker', {
160162
marker: expectedMarker,
161163
})
@@ -166,6 +168,8 @@ export function getValidatorListFromNode(
166168
node: ActiveNode,
167169
expectedHash: hexstring
168170
): ResultAsync<Validator[], Error> {
171+
info(`getValidatorListFromNode: expectedHash: ${expectedHash}`)
172+
169173
return attemptSimpleFetch(
170174
node,
171175
'validator-list',
@@ -181,6 +185,8 @@ export function getArchiverListFromNode(
181185
node: ActiveNode,
182186
expectedHash: hexstring
183187
): ResultAsync<Archiver[], Error> {
188+
info(`getArchiverListFromNode: expectedHash: ${expectedHash}`)
189+
184190
return attemptSimpleFetch(node, 'archiver-list', {
185191
hash: expectedHash,
186192
})
@@ -191,6 +197,8 @@ export function getStandbyNodeListFromNode(
191197
node: ActiveNode,
192198
expectedHash: hexstring
193199
): ResultAsync<JoinRequest[], Error> {
200+
info(`getStandbyNodeListFromNode: expectedHash: ${expectedHash}`)
201+
194202
return attemptSimpleFetch(
195203
node,
196204
'standby-list',
@@ -206,6 +214,8 @@ export function getTxListFromNode(
206214
node: ActiveNode,
207215
expectedHash: hexstring
208216
): ResultAsync<{ hash: string; tx: P2P.ServiceQueueTypes.AddNetworkTx }[], Error> {
217+
info(`getTxListFromNode: expectedHash: ${expectedHash}`)
218+
209219
return attemptSimpleFetch(
210220
node,
211221
'tx-list',
@@ -215,3 +225,8 @@ export function getTxListFromNode(
215225
10000 //TODO need to make this scale when there could be millions of entries
216226
)
217227
}
228+
229+
function info(...msg) {
230+
const entry = `SyncV2: ${msg.join(' ')}`
231+
p2pLogger.info(entry)
232+
}

src/p2p/SyncV2/verify.ts

+9
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import { HashableObject } from '../../crypto'
99
import { crypto } from '../Context'
1010
import { makeCycleMarker } from '../CycleCreator'
1111
import { Utils } from '@shardus/types'
12+
import { p2pLogger } from './queries'
13+
import { logFlags } from '../../logger'
1214

1315
/**
1416
* Verifies if the hash of a given object matches the expected hash.
@@ -39,6 +41,8 @@ export function verifyValidatorList(
3941
validatorList: P2P.NodeListTypes.Node[],
4042
expectedHash: hexstring
4143
): Result<boolean, Error> {
44+
45+
if(logFlags.p2pSyncDebug) info(`verifyValidatorList ${expectedHash} ${Utils.safeStringify(validatorList)} `);
4246
return verify(validatorList, expectedHash, 'validator list')
4347
}
4448

@@ -76,3 +80,8 @@ export function verifyTxList(
7680

7781
return ok(true)
7882
}
83+
84+
function info(...msg) {
85+
const entry = `SyncV2: ${msg.join(' ')}`
86+
p2pLogger.info(entry)
87+
}

src/shardus/index.ts

+7
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,14 @@ class Shardus extends EventEmitter {
772772
// todo hook this up later cant deal with it now.
773773
// await this.storage.deleteOldDBPath()
774774

775+
/* // LOCAL_OOS_TEST_SUPPORT not for production
776+
this.mainLogger.info('sync-p2p synced waiting 4 min')
777+
await utils.sleep(240000) //do not release this helps us have a chance
778+
//to query /config before the node syncs data
779+
*/
780+
this.mainLogger.info('sync-syncAppData')
775781
await this.syncAppData()
782+
776783
}
777784
})
778785
Self.emitter.on('restore', async (cycleNumber: number) => {

0 commit comments

Comments
 (0)