-
Notifications
You must be signed in to change notification settings - Fork 45
/
homa_impl.h
1227 lines (1070 loc) · 39.3 KB
/
homa_impl.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* SPDX-License-Identifier: BSD-2-Clause */
/* This file contains definitions that are shared across the files
* that implement Homa for Linux.
*/
#ifndef _HOMA_IMPL_H
#define _HOMA_IMPL_H
#include <linux/bug.h>
#ifdef __UNIT_TEST__
/* When compiling for unit tests, replace the warning macros that
* <linux/bug.h> (included just above) may have defined.
*/
/* WARN expands to nothing: neither the condition nor the format
* arguments are evaluated.
*/
#undef WARN
#define WARN(condition, format...)
/* WARN_ON still evaluates its condition exactly once and yields its
* truth value (wrapped in unlikely()), so it can still be used inside
* an "if"; nothing else is done with the result here.
*/
#undef WARN_ON
#define WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
unlikely(__ret_warn_on); \
})
/* WARN_ON_ONCE behaves exactly like the WARN_ON replacement above. */
#undef WARN_ON_ONCE
#define WARN_ON_ONCE(condition) WARN_ON(condition)
#endif
#include <linux/audit.h>
#include <linux/icmp.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <net/gro.h>
#include <net/rps.h>
#ifdef __UNIT_TEST__
/* When compiling for unit tests, redirect a number of kernel functions and
 * macros to mock_* replacements so that Homa code can run outside a real
 * kernel. Each #define below is paired with the extern declaration of the
 * replacement it names (the replacements themselves are defined elsewhere,
 * presumably in the unit-test support code).
 */
#undef alloc_pages
#define alloc_pages mock_alloc_pages
extern struct page *mock_alloc_pages(gfp_t gfp, unsigned int order);

#define compound_order mock_compound_order
extern unsigned int mock_compound_order(struct page *page);

#define cpu_to_node mock_cpu_to_node
extern int mock_cpu_to_node(int cpu);

/* "current" becomes a plain global variable that tests can set. */
#undef current
#define current current_task
extern struct task_struct *current_task;

#undef get_cycles
#define get_cycles mock_get_cycles
extern cycles_t mock_get_cycles(void);

#define get_page mock_get_page
extern void mock_get_page(struct page *page);

#undef kmalloc
#define kmalloc mock_kmalloc
extern void *mock_kmalloc(size_t size, gfp_t flags);

/* Arguments are parenthesized so that expressions such as "n + 1" are
 * multiplied as a whole rather than being split by operator precedence.
 */
#undef kmalloc_array
#define kmalloc_array(count, size, flags) \
	mock_kmalloc((count) * (size), (flags))

/* Expands to nothing under unit test. */
#define kthread_complete_and_exit(comp, code)

/* Under unit test a page "address" is just the page pointer itself; the
 * argument is parenthesized so the cast applies to the whole expression.
 */
#ifdef page_address
#undef page_address
#endif
#define page_address(page) ((void *) (page))

#define page_ref_count mock_page_refs
extern int mock_page_refs(struct page *page);

#define page_to_nid mock_page_to_nid
extern int mock_page_to_nid(struct page *page);

#define put_page mock_put_page
extern void mock_put_page(struct page *page);

#define rcu_read_lock mock_rcu_read_lock
extern void mock_rcu_read_lock(void);

#define rcu_read_unlock mock_rcu_read_unlock
extern void mock_rcu_read_unlock(void);

#undef register_net_sysctl
#define register_net_sysctl mock_register_net_sysctl
extern struct ctl_table_header *mock_register_net_sysctl(struct net *net,
		const char *path, struct ctl_table *table);

/* signal_pending() ignores its argument and reads a test-controlled flag. */
#define signal_pending(xxx) mock_signal_pending
extern int mock_signal_pending;

#define spin_unlock mock_spin_unlock
extern void mock_spin_unlock(spinlock_t *lock);

#undef vmalloc
#define vmalloc mock_vmalloc
extern void *mock_vmalloc(size_t size);

/* Per-CPU variables become ordinary 10-element arrays indexed by core. */
#undef DECLARE_PER_CPU
#define DECLARE_PER_CPU(type, name) extern type name[10];

#undef DEFINE_PER_CPU
#define DEFINE_PER_CPU(type, name) type name[10];

#undef per_cpu
#define per_cpu(name, core) (name[core])
#endif /* __UNIT_TEST__ */
/* Null out things that confuse VSCode Intellisense. These definitions take
* effect only when __VSCODE__ is defined: raw_smp_processor_id() becomes
* the constant 1 and the remaining macros expand to nothing.
*/
#ifdef __VSCODE__
#define raw_smp_processor_id() 1
#define BUG()
#define BUG_ON(...)
#define set_current_state(...)
#endif
/* Forward declarations: these struct tags are introduced before the
* Homa headers below are included.
*/
struct homa_peer;
struct homa_sock;
struct homa;
#include "homa.h"
#include "timetrace.h"
#include "homa_metrics.h"
/* Declarations used in this file, so they can't be made at the end.
* homa_throttle_lock_slow is called by the inline homa_throttle_lock
* defined later in this header.
*/
extern void homa_throttle_lock_slow(struct homa *homa);
/** define sizeof32 - Yields sizeof(@type) converted to a (signed) int. */
#define sizeof32(type) ((int) (sizeof(type)))
/** define CACHE_LINE_SIZE - The number of bytes in a cache line. */
#define CACHE_LINE_SIZE 64

/**
 * define HOMA_MAX_GRANTS - Used to size various data structures for grant
 * management; the max_overcommit sysctl parameter must never be greater than
 * this.
 */
#define HOMA_MAX_GRANTS 10

/**
 * struct homa_cache_line - An object whose size equals that of a cache line.
 */
struct homa_cache_line {
	/**
	 * @bytes: Filler sized from CACHE_LINE_SIZE (rather than a repeated
	 * literal) so the struct always tracks that constant.
	 */
	char bytes[CACHE_LINE_SIZE];
};
/**
* struct homa_interest - Contains various information used while waiting
* for incoming messages (indicates what kinds of messages a particular
* thread is interested in receiving).
*/
struct homa_interest {
/**
* @thread: Thread that would like to receive a message. Will get
* woken up when a suitable message becomes available.
*/
struct task_struct *thread;
/**
* @ready_rpc: This is actually a (struct homa_rpc *) identifying the
* RPC that was found; NULL if no RPC has been found yet. This
* variable is used for synchronization to hand off the RPC, and
* must be set only after @locked is set.
*/
atomic_long_t ready_rpc;
/**
* @locked: Nonzero means that @ready_rpc is locked; only valid
* if @ready_rpc is non-NULL.
*/
int locked;
/**
* @core: Core on which @thread was executing when it registered
* its interest. Used for load balancing (see balance.txt).
*/
int core;
/**
* @reg_rpc: RPC whose @interest field points here, or
* NULL if none.
*/
struct homa_rpc *reg_rpc;
/**
* @request_links: For linking this object into
* &homa_sock.request_interests. The interest must not be linked
* on either this list or @response_links if @id is nonzero.
* NOTE(review): this struct has no @id field; the condition
* presumably refers to waiting for one specific RPC (see @reg_rpc).
* Confirm and reword.
*/
struct list_head request_links;
/**
* @response_links: For linking this object into the socket's list of
* interests in responses. NOTE(review): the previous text named
* &homa_sock.request_interests here, the same list as @request_links,
* which looks like a copy/paste slip; presumably the intended list is
* &homa_sock.response_interests. Confirm against struct homa_sock.
*/
struct list_head response_links;
};
/**
 * homa_interest_init() - Give every field of a struct homa_interest its
 * default value.
 * @interest:   Struct to initialize; all of its fields are overwritten
 *              (for the two list_heads, only the next pointer is set).
 */
static inline void homa_interest_init(struct homa_interest *interest)
{
	/* Only the forward pointers of the list links are poisoned. */
	interest->response_links.next = LIST_POISON1;
	interest->request_links.next = LIST_POISON1;

	/* No RPC is registered with, or has been handed to, this interest. */
	interest->reg_rpc = NULL;
	interest->locked = 0;
	atomic_long_set(&interest->ready_rpc, 0);

	/* Record the calling thread and the core it is running on now. */
	interest->core = raw_smp_processor_id();
	interest->thread = current;
}
/**
 * enum homa_freeze_type - The set of values that may be passed as the
 * @type argument to homa_freeze. The numeric values are spelled out
 * explicitly for each enumerator.
 */
enum homa_freeze_type {
	RESTART_RPC           = 1,
	PEER_TIMEOUT          = 2,
	SLOW_RPC              = 3,
	SOCKET_CLOSE          = 4,
	PACKET_LOST           = 5,
	NEED_ACK_MISSING_DATA = 6,
};
/**
* struct homa - Overall information about the Homa protocol implementation.
*
* There will typically only exist one of these at a time, except during
* unit tests.
*/
struct homa {
/**
* @next_outgoing_id: Id to use for next outgoing RPC request.
* This is always even: it's used only to generate client-side ids.
* Accessed without locks.
*/
atomic64_t next_outgoing_id;
/**
* @link_idle_time: The time, measured by get_cycles(), at which we
* estimate that all of the packets we have passed to Linux for
* transmission will have been transmitted. May be in the past.
* This estimate assumes that only Homa is transmitting data, so
* it could be a severe underestimate if there is competing traffic
* from, say, TCP. Access only with atomic ops.
*/
atomic64_t link_idle_time __aligned(CACHE_LINE_SIZE);
/**
* @grantable_lock: Used to synchronize access to grant-related
* fields below, from @grantable_peers to @last_grantable_change.
*/
spinlock_t grantable_lock __aligned(CACHE_LINE_SIZE);
/**
* @grantable_lock_time: get_cycles() time when grantable_lock
* was last locked.
*/
__u64 grantable_lock_time;
/**
* @grant_recalc_count: Incremented every time homa_grant_recalc
* starts a new recalculation; used to avoid unnecessary
* recalculations in other threads. If a thread sees this value
* change, it knows that someone else is recalculating grants.
*/
atomic_t grant_recalc_count;
/**
* @grantable_peers: Contains all peers with entries in their
* grantable_rpcs lists. The list is sorted in priority order of
* the highest priority RPC for each peer (fewer ungranted bytes ->
* higher priority).
*/
struct list_head grantable_peers;
/**
* @grantable_rpcs: Contains all RPCs that have not been fully
* granted. The list is sorted in priority order (fewer ungranted
* bytes -> higher priority).
*/
struct list_head grantable_rpcs;
/** @num_grantable_rpcs: The number of RPCs in grantable_rpcs. */
int num_grantable_rpcs;
/**
* @last_grantable_change: The get_cycles() time of the most recent
* increment or decrement of num_grantable_rpcs; used for computing
* statistics.
*/
__u64 last_grantable_change;
/**
* @max_grantable_rpcs: The largest value that has been seen for
* num_grantable_rpcs since this value was reset to 0 (it can be
* reset externally using sysctl).
*/
int max_grantable_rpcs;
/**
* @oldest_rpc: The RPC with incoming data whose start_cycles is
* farthest in the past. NULL means either there are no incoming
* RPCs or the oldest needs to be recomputed. Must hold grantable_lock
* to update.
*/
struct homa_rpc *oldest_rpc;
/**
* @grant_window: How many bytes of granted but not yet received data
* may exist for an RPC at any given time.
*/
int grant_window;
/**
* @num_active_rpcs: number of entries in @active_rpcs and
* @active_remaining that are currently used.
*/
int num_active_rpcs;
/**
* @active_rpcs: pointers to all of the RPCs that we will grant to
* right now. Slot 0 is highest priority.
*/
struct homa_rpc *active_rpcs[HOMA_MAX_GRANTS];
/**
* @active_remaining: entry i in this array contains a copy of
* active_rpcs[i]->msgin.bytes_remaining. These values can be
* updated by the corresponding RPCs without holding the grantable
* lock. Perfect consistency isn't required; this is used only to
* detect when the priority ordering of messages changes.
*/
atomic_t active_remaining[HOMA_MAX_GRANTS];
/**
* @grant_nonfifo: How many bytes should be granted using the
* normal priority system between grants to the oldest message.
*/
int grant_nonfifo;
/**
* @grant_nonfifo_left: Counts down bytes using the normal
* priority mechanism. When this reaches zero, it's time to grant
* to the old message.
*/
int grant_nonfifo_left;
/**
* @pacer_mutex: Ensures that only one instance of homa_pacer_xmit
* runs at a time. Only used in "try" mode: never block on this.
* (Despite the name, this is declared as a spinlock.)
*/
spinlock_t pacer_mutex __aligned(CACHE_LINE_SIZE);
/**
* @pacer_fifo_fraction: The fraction of time (in thousandths) when
* the pacer should transmit next from the oldest message, rather
* than the highest-priority message. Set externally via sysctl.
*/
int pacer_fifo_fraction;
/**
* @pacer_fifo_count: When this becomes <= zero, it's time for the
* pacer to allow the oldest RPC to transmit.
*/
int pacer_fifo_count;
/**
* @pacer_wake_time: get_cycles() time when the pacer last woke up
* (if the pacer is running) or 0 if the pacer is sleeping.
*/
__u64 pacer_wake_time;
/**
* @throttle_lock: Used to synchronize access to @throttled_rpcs. To
* insert or remove an RPC from throttled_rpcs, must first acquire
* the RPC's socket lock, then this lock.
*/
spinlock_t throttle_lock;
/**
* @throttled_rpcs: Contains all homa_rpcs that have bytes ready
* for transmission, but which couldn't be sent without exceeding
* the queue limits for transmission. Manipulate only with "_rcu"
* functions.
*/
struct list_head throttled_rpcs;
/**
* @throttle_add: The get_cycles() time when the most recent RPC
* was added to @throttled_rpcs.
*/
__u64 throttle_add;
/**
* @throttle_min_bytes: If a packet has fewer bytes than this, then it
* bypasses the throttle mechanism and is transmitted immediately.
* We have this limit because for very small packets we can't keep
* up with the NIC (we're limited by CPU overheads); there's no
* need for throttling and going through the throttle mechanism
* adds overhead, which slows things down. At least, that's the
* hypothesis (needs to be verified experimentally!). Set externally
* via sysctl.
*/
int throttle_min_bytes;
/**
* @total_incoming: the total number of bytes that we expect to receive
* (across all messages) even if we don't send out any more grants
* (includes granted but unreceived bytes, plus unreceived unscheduled
* bytes that we know about). This can potentially be negative, if
* a peer sends more bytes than granted (see synchronization note in
* homa_send_grants for why we have to allow this possibility).
*/
atomic_t total_incoming __aligned(CACHE_LINE_SIZE);
/**
* @next_client_port: A client port number to consider for the
* next Homa socket; increments monotonically. Current value may
* be in the range allocated for servers; must check before using.
* This port may also be in use already; must check.
*/
__u16 next_client_port __aligned(CACHE_LINE_SIZE);
/**
* @port_map: Information about all open sockets. Dynamically
* allocated; must be kfreed.
*/
struct homa_socktab *port_map __aligned(CACHE_LINE_SIZE);
/**
* @peers: Info about all the other hosts we have communicated with.
* Dynamically allocated; must be kfreed.
*/
struct homa_peertab *peers;
/**
* @page_pool_mutex: Synchronizes access to any/all of the page_pools
* used for outgoing sk_buff data. (Declared as a spinlock.)
*/
spinlock_t page_pool_mutex __aligned(CACHE_LINE_SIZE);
/**
* @page_pools: One page pool for each NUMA node on the machine.
* If there are no cores for a node, then its entry is NULL.
*/
struct homa_page_pool *page_pools[MAX_NUMNODES];
/** @max_numa: Highest NUMA node id in use by any core. */
int max_numa;
/**
* @skb_page_frees_per_sec: Rate at which to return pages from sk_buff
* page pools back to Linux. This is the total rate across all pools.
* Set externally via sysctl.
*/
int skb_page_frees_per_sec;
/**
* @skb_pages_to_free: Space in which to collect pages that are
* about to be released. Dynamically allocated.
*/
struct page **skb_pages_to_free;
/**
* @pages_to_free_slots: Maximum number of pages that can be
* stored in skb_pages_to_free.
*/
int pages_to_free_slots;
/**
* @skb_page_free_time: Time (in get_cycles() units) when the
* next sk_buff page should be freed. Could be in the past.
*/
__u64 skb_page_free_time;
/**
* @skb_page_pool_min_kb: Don't return pages from a pool to Linux
* if the amount of cached data in the pool has been less than this
* many KBytes at any time in the recent past. Set externally via
* sysctl.
*/
int skb_page_pool_min_kb;
/**
* @unsched_bytes: The number of bytes that may be sent in a
* new message without receiving any grants. There used to be a
* variable rtt_bytes that served this purpose, and was also used
* for window. Historically, rtt_bytes was intended to be the amount
* of data that can be transmitted over the wire in the time it
* takes to send a full-size data packet and receive back a grant.
* But, for fast networks that value could result in too much
* buffer utilization (and, we wanted to have separate values for
* @unsched_bytes and the grant window). Set externally via sysctl.
*/
int unsched_bytes;
/**
* @window_param: Set externally via sysctl to select a policy for
* computing homa->grant_window. If 0 then homa->grant_window is
* computed dynamically based on the number of RPCs we're currently
* granting to. If nonzero then homa->grant_window will always be the
* same as @window_param.
*/
int window_param;
/**
* @link_mbps: The raw bandwidth of the network uplink, in
* units of 1e06 bits per second. Set externally via sysctl.
*/
int link_mbps;
/**
* @poll_usecs: Amount of time (in microseconds) that a thread
* will spend busy-waiting for an incoming message before
* going to sleep. Set externally via sysctl.
*/
int poll_usecs;
/**
* @poll_cycles: The value of @poll_usecs in the units returned
* by get_cycles().
*/
int poll_cycles;
/**
* @num_priorities: The total number of priority levels available for
* Homa's use. Internally, Homa will use priorities from 0 to
* num_priorities-1, inclusive. Set externally via sysctl.
*/
int num_priorities;
/**
* @priority_map: entry i gives the value to store in the high-order
* 3 bits of the DSCP field of IP headers to implement priority level
* i. Set externally via sysctl.
*/
int priority_map[HOMA_MAX_PRIORITIES];
/**
* @max_sched_prio: The highest priority level currently available for
* scheduled packets. Levels above this are reserved for unscheduled
* packets. Set externally via sysctl.
*/
int max_sched_prio;
/**
* @unsched_cutoffs: the current priority assignments for incoming
* unscheduled packets. The value of entry i is the largest
* message size that uses priority i (larger i is higher priority).
* If entry i has a value of HOMA_MAX_MESSAGE_SIZE or greater, then
* priority levels less than i will not be used for unscheduled
* packets. At least one entry in the array must have a value of
* HOMA_MAX_MESSAGE_SIZE or greater (entry 0 is usually INT_MAX).
* Set externally via sysctl.
*/
int unsched_cutoffs[HOMA_MAX_PRIORITIES];
/**
* @cutoff_version: increments every time unsched_cutoffs is
* modified. Used to determine when we need to send updates to
* peers. Note: 16 bits should be fine for this: the worst
* that happens is a peer has a super-stale value that equals
* our current value, so the peer uses suboptimal cutoffs until the
* next version change. Can be set externally via sysctl.
* NOTE(review): the field is declared as int here; the 16-bit remark
* presumably describes how the value is carried elsewhere. Confirm.
*/
int cutoff_version;
/**
* @fifo_grant_increment: how many additional bytes to grant in
* a "pity" grant sent to the oldest outstanding message. Set
* externally via sysctl.
*/
int fifo_grant_increment;
/**
* @grant_fifo_fraction: The fraction (in thousandths) of granted
* bytes that should go to the *oldest* incoming message, rather
* than the highest priority ones. Set externally via sysctl.
*/
int grant_fifo_fraction;
/**
* @max_overcommit: The maximum number of messages to which Homa will
* send grants at any given point in time. Set externally via sysctl.
*/
int max_overcommit;
/**
* @max_incoming: Homa will try to ensure that the total number of
* bytes senders have permission to send to this host (either
* unscheduled bytes or granted bytes) does not exceed this value.
* Set externally via sysctl.
*/
int max_incoming;
/**
* @max_rpcs_per_peer: If there are multiple incoming messages from
* the same peer, Homa will only issue grants to this many of them
* at a time. Set externally via sysctl.
*/
int max_rpcs_per_peer;
/**
* @resend_ticks: When an RPC's @silent_ticks reaches this value,
* start sending RESEND requests.
*/
int resend_ticks;
/**
* @resend_interval: minimum number of homa timer ticks between
* RESENDs for the same RPC.
*/
int resend_interval;
/**
* @timeout_ticks: abort an RPC if its silent_ticks reaches this value.
*/
int timeout_ticks;
/**
* @timeout_resends: Assume that a server is dead if it has not
* responded after this many RESENDs have been sent to it.
*/
int timeout_resends;
/**
* @request_ack_ticks: How many timer ticks we'll wait for the
* client to ack an RPC before explicitly requesting an ack.
* Set externally via sysctl.
*/
int request_ack_ticks;
/**
* @reap_limit: Maximum number of packet buffers to free in a
* single call to homa_rpc_reap.
*/
int reap_limit;
/**
* @dead_buffs_limit: If the number of packet buffers in dead but
* not yet reaped RPCs is less than this number, then Homa reaps
* RPCs in a way that minimizes impact on performance but may permit
* dead RPCs to accumulate. If the number of dead packet buffers
* exceeds this value, then Homa switches to a more aggressive approach
* to reaping RPCs. Set externally via sysctl.
*/
int dead_buffs_limit;
/**
* @max_dead_buffs: The largest aggregate number of packet buffers
* in dead (but not yet reaped) RPCs that has existed so far in a
* single socket. Readable via sysctl, and may be reset via sysctl
* to begin recalculating.
*/
int max_dead_buffs;
/**
* @pacer_kthread: Kernel thread that transmits packets from
* throttled_rpcs in a way that limits queue buildup in the
* NIC.
*/
struct task_struct *pacer_kthread;
/**
* @pacer_exit: true means that the pacer thread should exit as
* soon as possible.
*/
bool pacer_exit;
/**
* @max_nic_queue_ns: Limits the NIC queue length: we won't queue
* up a packet for transmission if link_idle_time is this many
* nanoseconds in the future (or more). Set externally via sysctl.
*/
int max_nic_queue_ns;
/**
* @max_nic_queue_cycles: Same as max_nic_queue_ns, except in units
* of get_cycles().
*/
int max_nic_queue_cycles;
/**
* @cycles_per_kbyte: the number of cycles, as measured by get_cycles(),
* that it takes to transmit 1000 bytes on our uplink. This is actually
* a slight overestimate of the value, to ensure that we don't
* underestimate NIC queue length and queue too many packets.
*/
__u32 cycles_per_kbyte;
/**
* @verbose: Nonzero enables additional logging. Set externally via
* sysctl.
*/
int verbose;
/**
* @max_gso_size: Maximum number of bytes that will be included
* in a single output packet that Homa passes to Linux. Can be set
* externally via sysctl to lower the limit already enforced by Linux.
*/
int max_gso_size;
/**
* @gso_force_software: A non-zero value will cause Homa to perform
* segmentation in software using GSO; zero means ask the NIC to
* perform TSO. Set externally via sysctl.
*/
int gso_force_software;
/**
* @hijack_tcp: Non-zero means encapsulate outgoing Homa packets
* as TCP packets (i.e. use TCP as the IP protocol). This makes TSO
* and RSS work better. Set externally via sysctl.
*/
int hijack_tcp;
/**
* @max_gro_skbs: Maximum number of socket buffers that can be
* aggregated by the GRO mechanism. Set externally via sysctl.
*/
int max_gro_skbs;
/**
* @gro_policy: An OR'ed together collection of bits that determine
* how Homa packets should be steered for SoftIRQ handling. A value
* of zero will eliminate any Homa-specific behaviors, reverting
* to the Linux defaults. Set externally via sysctl (but modifying
* it is almost certainly a bad idea; see below).
*/
int gro_policy;
/* Bits that can be specified for gro_policy. These were created for
* testing, in order to evaluate various possible policies; you almost
* certainly should not use any value other than HOMA_GRO_NORMAL.
* HOMA_GRO_SAME_CORE If isolated packets arrive (not part of
* a batch) use the GRO core for SoftIRQ also.
* HOMA_GRO_IDLE Use old mechanism for selecting an idle
* core for SoftIRQ (deprecated).
* HOMA_GRO_NEXT Always use the next core in circular
* order for SoftIRQ (deprecated).
* HOMA_GRO_GEN2 Use the new mechanism for selecting an
* idle core for SoftIRQ.
* HOMA_GRO_FAST_GRANTS Pass all grants immediately to
* homa_softirq during GRO (only if the
* core isn't overloaded).
* HOMA_GRO_SHORT_BYPASS Pass all single-packet messages directly
* to homa_softirq during GRO (only if the
* core isn't overloaded).
* HOMA_GRO_GEN3 Use the "Gen3" mechanisms for load
* balancing.
* HOMA_GRO_NORMAL The combination SAME_CORE | GEN2 |
* SHORT_BYPASS | FAST_GRANTS.
*/
#define HOMA_GRO_SAME_CORE 2
#define HOMA_GRO_IDLE 4
#define HOMA_GRO_NEXT 8
#define HOMA_GRO_GEN2 0x10
#define HOMA_GRO_FAST_GRANTS 0x20
#define HOMA_GRO_SHORT_BYPASS 0x40
#define HOMA_GRO_GEN3 0x80
#define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE|HOMA_GRO_GEN2 \
|HOMA_GRO_SHORT_BYPASS|HOMA_GRO_FAST_GRANTS)
/**
* @busy_usecs: if there has been activity on a core within the
* last @busy_usecs, it is considered to be busy and Homa will
* try to avoid scheduling other activities on the core. See
* balance.txt for more on load balancing. Set externally via sysctl.
*/
int busy_usecs;
/** @busy_cycles: Same as busy_usecs except in get_cycles() units. */
int busy_cycles;
/**
* @gro_busy_usecs: if the gap between the completion of
* homa_gro_receive and the next call to homa_gro_receive on the same
* core is less than this, then GRO on that core is considered to be
* "busy", and optimizations such as HOMA_GRO_SHORT_BYPASS will not be
* done because they risk overloading the core. Set externally via
* sysctl.
*/
int gro_busy_usecs;
/**
* @gro_busy_cycles: Same as gro_busy_usecs except in get_cycles()
* units.
*/
int gro_busy_cycles;
/**
* @timer_ticks: number of times that homa_timer has been invoked
* (may wraparound, which is safe).
*/
__u32 timer_ticks;
/**
* @metrics_lock: Used to synchronize accesses to @metrics_active_opens
* and updates to @metrics.
*/
spinlock_t metrics_lock;
/**
* @metrics: a human-readable string containing recent values
* for all the Homa performance metrics, as generated by
* homa_append_metric. This string is kmalloc-ed; NULL means
* homa_append_metric has never been called.
*/
char *metrics;
/** @metrics_capacity: number of bytes available at metrics. */
size_t metrics_capacity;
/**
* @metrics_length: current length of the string in metrics,
* not including the terminating null character.
*/
size_t metrics_length;
/**
* @metrics_active_opens: number of open struct files that
* currently exist for the metrics file in /proc.
*/
int metrics_active_opens;
/**
* @flags: a collection of bits that can be set using sysctl
* to trigger various behaviors.
*/
int flags;
/**
* @freeze_type: determines conditions under which the time trace
* should be frozen. Set externally via sysctl.
*/
enum homa_freeze_type freeze_type;
/**
* @bpage_lease_usecs: how long a core can own a bpage (microseconds)
* before its ownership can be revoked to reclaim the page.
*/
int bpage_lease_usecs;
/**
* @bpage_lease_cycles: The value of @bpage_lease_usecs in get_cycles()
* units.
*/
int bpage_lease_cycles;
/**
* @next_id: Set via sysctl; causes next_outgoing_id to be set to
* this value; always reads as zero. Typically used while debugging to
* ensure that different nodes use different ranges of ids.
*/
int next_id;
/**
* @temp: the values in this array can be read and written with sysctl.
* They have no officially defined purpose, and are available for
* short-term use during testing.
*/
int temp[4];
};
/**
* struct homa_skb_info - Additional information needed by Homa for each
* outbound DATA packet. Space is allocated for this at the very end of the
* linear part of the skb (homa_get_skb_info locates it by subtracting
* sizeof(struct homa_skb_info) from skb_end_pointer()).
*/
struct homa_skb_info {
/**
* @next_skb: used to link together all of the skb's for a Homa
* message (in order of offset). This is the first member, so it
* sits at the address returned by homa_get_skb_info.
*/
struct sk_buff *next_skb;
/**
* @wire_bytes: total number of bytes of network bandwidth that
* will be consumed by this packet. This includes everything,
* including additional headers added by GSO, IP header, Ethernet
* header, CRC, preamble, and inter-packet gap.
*/
int wire_bytes;
/**
* @data_bytes: total bytes of message data across all of the
* segments in this packet.
*/
int data_bytes;
/** @seg_length: maximum number of data bytes in each GSO segment. */
int seg_length;
/**
* @offset: offset within the message of the first byte of data in
* this packet.
*/
int offset;
};
/**
 * homa_get_skb_info() - Locate Homa's private per-packet information
 * inside an sk_buff.
 * @skb:     Socket buffer whose info is needed.
 *
 * Return:   Pointer to the struct homa_skb_info occupying the last
 *           sizeof(struct homa_skb_info) bytes before skb_end_pointer().
 */
static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb)
{
	unsigned char *end = skb_end_pointer(skb);

	/* Step back from the end by exactly one homa_skb_info. */
	return (struct homa_skb_info *)(end - sizeof(struct homa_skb_info));
}
/**
 * homa_next_skb() - Compute address of Homa's private link field in @skb.
 * @skb:     Socket buffer containing private link field.
 *
 * Homa keeps its own list of the buffers in a message. It can't use the
 * links built into sk_buffs, because those get used during sending and
 * Homa wants to retain its list even after the packet has been sent.
 * Instead, a forward pointer is kept in extra space at the very end of
 * the packet's data area.
 *
 * NOTE(review): this returns the last pointer-sized slot before
 * skb_end_pointer(), whereas homa_get_skb_info() places a struct
 * homa_skb_info (whose first member is next_skb, followed by four ints)
 * in the same trailing region. The two therefore do not address the same
 * location; confirm whether this helper still has callers.
 *
 * Return:   Address of the pointer-sized slot that ends at
 *           skb_end_pointer().
 */
static inline struct sk_buff **homa_next_skb(struct sk_buff *skb)
{
	unsigned char *link = skb_end_pointer(skb) - sizeof(char *);

	return (struct sk_buff **)link;
}
/**
 * homa_set_doff() - Fills in the doff TCP header field for a Homa packet.
 * @h:     Packet header whose doff field is to be set.
 * @size:  Size of the "header", bytes (must be a multiple of 4). This
 *         information is used only for TSO; it's the number of bytes
 *         that should be replicated in each segment. The bytes after
 *         this will be distributed among segments.
 */
static inline void homa_set_doff(struct data_header *h, int size)
{
	/* For a multiple of 4, size << 2 equals (size / 4) << 4; presumably
	 * this places the length in 4-byte words in the upper four bits of
	 * the field, as TCP's data offset does (confirm against the layout
	 * of the common header).
	 */
	int encoded = size << 2;

	h->common.doff = encoded;
}
/**
* homa_throttle_lock() - Acquire the throttle lock. If the lock
* isn't immediately available, record stats on the waiting time.
* @homa: Overall data about the Homa protocol implementation.
*/
static inline void homa_throttle_lock(struct homa *homa)
{
if (!spin_trylock_bh(&homa->throttle_lock))
homa_throttle_lock_slow(homa);
}
/**
* homa_throttle_unlock() - Release the throttle lock.
* @homa: Overall data about the Homa protocol implementation.
*/
static inline void homa_throttle_unlock(struct homa *homa)
{
spin_unlock_bh(&homa->throttle_lock);
}
/**
 * skb_is_ipv6() - Return true if the packet is encapsulated with IPv6,
 * false otherwise (presumably it's IPv4).
 * @skb:     Packet to check; the version field is read through ipv6_hdr().
 */
static inline bool skb_is_ipv6(const struct sk_buff *skb)
{
	const int ip_version = ipv6_hdr(skb)->version;

	return ip_version == 6;
}
/**
 * ipv4_to_ipv6() - Given an IPv4 address, return an equivalent IPv6
 * address (an IPv4-mapped one).
 * @ip4:     IPv4 address, in network byte order.
 *
 * Return:   in6addr_any if @ip4 is INADDR_ANY; otherwise an address whose
 *           first two 32-bit words are zero, whose third word is
 *           htonl(0xffff), and whose fourth word is @ip4.
 */
static inline struct in6_addr ipv4_to_ipv6(__be32 ip4)
{
	struct in6_addr mapped = {};

	if (ip4 != INADDR_ANY) {
		mapped.in6_u.u6_addr32[2] = htonl(0xffff);
		mapped.in6_u.u6_addr32[3] = ip4;
		return mapped;
	}

	/* The wildcard address maps to the IPv6 wildcard. */
	return in6addr_any;
}
/**
* ipv6_to_ipv4() - Given an IPv6 address produced by ipv4_to_ipv6, return
* the original IPv4 address (in network byte order).
* @ip6: IPv6 address; assumed to be a mapped IPv4 address.
*/