Skip to content

Commit

Permalink
address feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
aagarwalTT committed Jan 27, 2025
1 parent a159d42 commit 178a802
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 40 deletions.
2 changes: 1 addition & 1 deletion tests/scripts/t3000/run_t3000_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ run_t3000_ttfabric_tests() {
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter=ControlPlaneFixture.*T3k*
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 2048 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 65 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16

# Record the end time
end_time=$(date +%s)
Expand Down
2 changes: 1 addition & 1 deletion tests/scripts/tg/run_tg_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ run_tg_tests() {
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter=ControlPlaneFixture.*TG*
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 2048 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 65 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16

elif [[ "$1" == "llama3-70b" ]]; then
run_tg_llama3.1-70b_tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ void kernel_main() {
test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED;
test_results[PQ_TEST_MISC_INDEX] = 0xff000000;

if constexpr ((ASYNC_WR == test_command) || (test_command == ASYNC_WR_ATOMIC_INC)) {
if constexpr (ASYNC_WR & test_command) {
uint32_t packet_rnd_seed;
uint64_t curr_packet_words, curr_payload_words, processed_packet_words_src;
uint32_t max_packet_size_mask, temp;
Expand Down Expand Up @@ -108,7 +108,7 @@ void kernel_main() {
uint32_t packet_index = 1;

// if fixed notification address, wait for all the packets
if constexpr ((test_command == ASYNC_WR_ATOMIC_INC) && fixed_async_wr_notif_addr) {
if constexpr ((test_command & ATOMIC_INC) && fixed_async_wr_notif_addr) {
uint64_t temp_words = 0, temp_packets = 0;
uint32_t temp_seed = packet_rnd_seed, temp_poll_val;
while (temp_words < total_data_words) {
Expand Down Expand Up @@ -151,7 +151,7 @@ void kernel_main() {
start_val = packet_rnd_seed & PAYLOAD_MASK;

// get the value and addr to poll on
if constexpr (test_command == ASYNC_WR_ATOMIC_INC) {
if constexpr (test_command & ATOMIC_INC) {
// poll on the first word in the payload
poll_addr = read_addr;
if constexpr (fixed_async_wr_notif_addr) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@ bool reset_notif_addr = true;

uint32_t time_seed;

// generates packets with random size and payload on the input side
template <bool do_atomic_inc>
// generates packets with random size and payload on the input sideß
inline bool test_buffer_handler_async_wr() {
if (input_queue_state.all_packets_done()) {
return true;
Expand Down Expand Up @@ -133,8 +132,9 @@ inline bool test_buffer_handler_async_wr() {
packet_header.routing.packet_size_bytes = input_queue_state.curr_packet_size_words * PACKET_WORD_SIZE_BYTES;
packet_header.routing.dst_mesh_id = dest_device >> 16;
packet_header.routing.dst_dev_id = dest_device & 0xFFFF;
if constexpr (do_atomic_inc) {
packet_header.session.command = ASYNC_WR_ATOMIC_INC;
packet_header.session.command = ASYNC_WR;
if constexpr (test_command & ATOMIC_INC) {
packet_header.session.command |= ATOMIC_INC;
packet_header.packet_parameters.async_wr_atomic_parameters.noc_xy = noc_offset;
packet_header.packet_parameters.async_wr_atomic_parameters.increment = atomic_increment;
if constexpr (fixed_async_wr_notif_addr) {
Expand All @@ -143,8 +143,6 @@ inline bool test_buffer_handler_async_wr() {
packet_header.packet_parameters.async_wr_atomic_parameters.l1_offset = target_address;
reset_notif_addr = true;
}
} else {
packet_header.session.command = ASYNC_WR;
}
packet_header.session.target_offset_l = target_address;
packet_header.session.target_offset_h = noc_offset;
Expand Down Expand Up @@ -193,7 +191,7 @@ inline bool test_buffer_handler_async_wr() {
(input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining - PACKET_HEADER_SIZE_WORDS);
fill_packet_data(reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr), num_words, start_val);
}
if constexpr (do_atomic_inc) {
if constexpr (test_command & ATOMIC_INC) {
if (reset_notif_addr) {
tt_l1_ptr uint32_t* addr = reinterpret_cast<tt_l1_ptr uint32_t*>(byte_wr_addr);
*addr = time_seed + input_queue_state.get_num_packets();
Expand Down Expand Up @@ -347,14 +345,12 @@ inline bool test_buffer_handler_fvcc() {
}

bool test_buffer_handler() {
if constexpr (test_command == ASYNC_WR) {
return test_buffer_handler_async_wr<false>();
if constexpr (test_command & ASYNC_WR) {
return test_buffer_handler_async_wr();
} else if constexpr (test_command == ATOMIC_INC) {
return test_buffer_handler_atomic_inc();
} else if constexpr (test_command == ASYNC_WR_RESP) {
return test_buffer_handler_fvcc();
} else if constexpr (test_command == ASYNC_WR_ATOMIC_INC) {
return test_buffer_handler_async_wr<true>();
}

return true;
Expand All @@ -374,7 +370,7 @@ void kernel_main() {
gk_interface_addr_l = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
gk_interface_addr_h = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));

if constexpr ((ASYNC_WR == test_command) || (ASYNC_WR_ATOMIC_INC == test_command)) {
if constexpr (ASYNC_WR & test_command) {
base_target_address = get_arg_val<uint32_t>(increment_arg_idx(rt_args_idx));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -628,15 +628,21 @@ typedef struct test_traffic {
tx_device->gk_noc_offset, // 8: gk_message_addr_h
};

if (ASYNC_WR == fabric_command || ASYNC_WR_ATOMIC_INC == fabric_command) {
if (ASYNC_WR & fabric_command) {
runtime_args.push_back(tx_to_rx_address_map[i]);
}

// zero out the signal address
tt::llrt::write_hex_vec_to_core(
tx_device->device_handle->id(), tx_physical_cores[i], zero_buf, tx_signal_address);

log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y);
log_info(
LogTest,
"run traffic_gen_tx at logical: x={},y={}; physical: x={},y={}",
core.x,
core.y,
tx_physical_cores[i].x,
tx_physical_cores[i].y);
auto kernel = tt_metal::CreateKernel(
tx_device->program_handle,
"tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp",
Expand All @@ -661,7 +667,7 @@ typedef struct test_traffic {
rx_buf_size // 2: space available in L1
};

if (ASYNC_WR == fabric_command || ASYNC_WR_ATOMIC_INC == fabric_command) {
if (ASYNC_WR & fabric_command) {
// push the src endpoint IDs
for (auto j : rx_to_tx_map[i]) {
runtime_args.push_back(tx_device->get_endpoint_id(tx_logical_cores[j]));
Expand All @@ -680,7 +686,13 @@ typedef struct test_traffic {
tt::llrt::write_hex_vec_to_core(
rx_device->device_handle->id(), rx_physical_cores[i], zero_buf, test_results_address);

log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y);
log_info(
LogTest,
"run traffic_gen_rx at logical: x={},y={}; physical: x={},y={}",
core.x,
core.y,
rx_physical_cores[i].x,
rx_physical_cores[i].y);
auto kernel = tt_metal::CreateKernel(
rx_device->program_handle,
"tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx.cpp",
Expand Down
7 changes: 3 additions & 4 deletions tt_fabric/hw/inc/tt_fabric.h
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,7 @@ typedef struct fvc_producer_state {
uint32_t words_processed = 0;
if (packet_is_for_local_chip()) {
if (current_packet_header.routing.flags == FORWARD) {
if (current_packet_header.session.command == ASYNC_WR ||
current_packet_header.session.command == ASYNC_WR_ATOMIC_INC) {
if (current_packet_header.session.command & ASYNC_WR) {
if (packet_in_progress == 0) {
packet_dest = ((uint64_t)current_packet_header.session.target_offset_h << 32) |
current_packet_header.session.target_offset_l;
Expand All @@ -574,8 +573,8 @@ typedef struct fvc_producer_state {
if (packet_words_remaining) {
words_processed = issue_async_write();
} else {
// for ASYNC_WR_ATOMIC_INC issue the atomic inc before invalidating the current packet
if (current_packet_header.session.command == ASYNC_WR_ATOMIC_INC) {
// for fused command issue the atomic inc before invalidating the current packet
if (current_packet_header.session.command & ATOMIC_INC) {
uint64_t noc_addr =
((uint64_t)current_packet_header.packet_parameters.async_wr_atomic_parameters.noc_xy
<< 32) |
Expand Down
27 changes: 12 additions & 15 deletions tt_fabric/hw/inc/tt_fabric_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,17 @@ constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = (NOC_MAX_BURST_WORDS * NOC_WORD_
constexpr uint32_t DEFAULT_MAX_ETH_SEND_WORDS = 2 * 1024;
constexpr uint32_t FVC_SYNC_THRESHOLD = 256;

enum SessionCommand : uint32_t {
ASYNC_WR = (0x1 << 0),
ASYNC_WR_RESP = (0x1 << 1),
ASYNC_RD = (0x1 << 2),
ASYNC_RD_RESP = (0x1 << 3),
DSOCKET_WR = (0x1 << 4),
SSOCKET_WR = (0x1 << 5),
ATOMIC_INC = (0x1 << 6),
ATOMIC_READ_INC = (0x1 << 7),
SOCKET_OPEN = (0x1 << 8),
SOCKET_CLOSE = (0x1 << 9),
SOCKET_CONNECT = (0x1 << 10),
ASYNC_WR_ATOMIC_INC = (0x1 << 11),
};
#define ASYNC_WR (0x1 << 0)
#define ASYNC_WR_RESP (0x1 << 1)
#define ASYNC_RD (0x1 << 2)
#define ASYNC_RD_RESP (0x1 << 3)
#define DSOCKET_WR (0x1 << 4)
#define SSOCKET_WR (0x1 << 5)
#define ATOMIC_INC (0x1 << 6)
#define ATOMIC_READ_INC (0x1 << 7)
#define SOCKET_OPEN (0x1 << 8)
#define SOCKET_CLOSE (0x1 << 9)
#define SOCKET_CONNECT (0x1 << 10)

#define INVALID 0x0
#define DATA 0x1
Expand All @@ -61,7 +58,7 @@ typedef struct _tt_routing {
static_assert(sizeof(tt_routing) == 16);

typedef struct _tt_session {
SessionCommand command;
uint32_t command;
uint32_t target_offset_l; // RDMA address
uint32_t target_offset_h;
uint32_t ack_offset_l; // fabric client local address for session command acknowledgement.
Expand Down

0 comments on commit 178a802

Please sign in to comment.