7flash/tb_tictactoe_predictor.v

## tb_tictactoe_predictor.v
// Testbench for tictactoe_predictor
// Simulates inputs, dumps waves for visualization

`timescale 1ns / 1ps

// Self-checking testbench for tictactoe_predictor.
// Drives three input vectors, prints the predicted position for each, counts
// failed checks and dumps all signals to tictactoe_sim.vcd.
module tb_tictactoe_predictor;
    // Stimulus driven into the DUT and the position index it returns.
    reg  [31:0] input_bits;
    wire [3:0]  output_pos;

    // Number of failed checks; reported just before $finish.
    integer error_count;

    // Instantiate the DUT (Design Under Test)
    tictactoe_predictor dut (
        .input_bits(input_bits),
        .output_pos(output_pos)
    );

    // apply_vector: drive one input vector, wait 10 ns for the combinational
    // logic to settle, print the result, and sanity-check that the output is
    // fully known and inside the 0-8 range declared on the DUT port.
    task apply_vector;
        input [31:0] vec;
        begin
            input_bits = vec;
            #10;
            $display("Input: %b, Output pos: %b (dec: %d)", input_bits, output_pos, output_pos);
            // Reduction XOR is X whenever any bit of output_pos is X or Z.
            if (((^output_pos) === 1'bx) || (output_pos > 4'd8)) begin
                $display("ERROR: output_pos is unknown (X/Z) or outside 0-8");
                error_count = error_count + 1;
            end
        end
    endtask

    // expect_pos: compare the current output with an expected position.
    // Uses case inequality (!==) so that an X/Z output counts as a failure;
    // with plain != the comparison result would be X and the check would be
    // skipped silently.
    task expect_pos;
        input integer test_id;
        input [3:0]   expected;
        begin
            if (output_pos !== expected) begin
                $display("ERROR: Test %0d failed! (expected %0d, got %0d)",
                         test_id, expected, output_pos);
                error_count = error_count + 1;
            end
        end
    endtask

    initial begin
        error_count = 0;

        $dumpfile("tictactoe_sim.vcd");        // VCD for waveform viewer
        $dumpvars(0, tb_tictactoe_predictor);  // Dump every signal below this testbench

        // NOTE(review): the expected positions below come from the original
        // author and presumably assume trained weights are loaded in the DUT;
        // confirm them against the reference model.

        // Test 1: vector 0x00000014 (bits 4 and 2 set), expected position 4.
        // NOTE(review): this case was originally labelled "empty board" even
        // though the vector is non-zero - confirm the intended board encoding.
        apply_vector(32'b00000000000000000000000000010100);
        expect_pos(1, 4'd4);

        // Test 2: the author's example "0000 0010 0100", applied here as
        // 0x00000480, expected position 8.
        // NOTE(review): bit order and padding of this encoding are unconfirmed.
        apply_vector(32'b00000000000000000000010010000000);
        expect_pos(2, 4'd8);

        // Test 3: all-zero input. No expected position was specified, so only
        // the known/in-range sanity check inside apply_vector applies.
        apply_vector(32'b00000000000000000000000000000000);

        if (error_count == 0)
            $display("All checks passed");
        else
            $display("%0d check(s) failed", error_count);

        $finish;  // End sim
    end

endmodule

## tictactoe_predictor.v
// Tic-Tac-Toe O-Move Predictor: Verilog Hardware Inference Engine
// Simplified MLP: 32-bit input -> FC1 (32x64) -> ReLU -> FC2 (64x9) -> Argmax -> 4-bit binary output
// Assumptions:
// - Fixed-point arithmetic: 8-bit signed integers (Q7.0, no fractions for simplicity; scale weights accordingly)
// - Combinatorial logic (no clock; for FPGA/ASIC, add clk/rst for pipelining)
// - Weights hardcoded (example values; replace with trained int-scaled ones, e.g., *128 and round)
// - ReLU: max(0, x)
// - Argmax: Finds index of max logit (0-8)
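// - Input: 32-bit vector [31:0]; input_bits[i] selects FC1 weight row i, so the board-to-bit mapping (including padding bits) must match the encoding used in training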
// - Output: 4-bit binary position (e.g., 1000 for pos8)
// Synthesize with tools like Vivado; simulate with ModelSim

// tictactoe_predictor: purely combinational 32 -> 64 -> 9 MLP followed by argmax.
//
// Data path (integer arithmetic, no clock):
//   FC1    : acc1[h]   = sum_i (input_bits[i] ? fc1_w[i][h] : 0)
//            hidden[h] = clamp((acc1[h] >>> SHIFT) + fc1_b[h], 0, 127)   // ReLU + saturation
//   FC2    : acc2[p]   = sum_h (hidden[h] * fc2_w[h][p])
//            logit[p]  = (acc2[p] >>> SHIFT) + fc2_b[p]
//   Argmax : output_pos = lowest index p holding the largest logit[p]
//
// Ports:
//   input_bits - 32-bit feature vector; bit i is paired with FC1 weight row i.
//   output_pos - index (0-8) of the largest logit, as a 4-bit binary number.
//
// Only four example weights are populated; every other weight and all biases
// are zero until real trained values are supplied.
module tictactoe_predictor (
    input  [31:0] input_bits,      // 32-bit input vector (bit i feeds FC1 weight row i)
    output [3:0]  output_pos       // 4-bit binary position (0-8)
);

    // Network dimensions.
    localparam N_IN   = 32;        // input features
    localparam N_HID  = 64;        // hidden neurons
    localparam N_OUT  = 9;         // output logits (board positions)

    // Right shift applied to each accumulator before the bias is added.
    localparam SHIFT  = 7;

    // Accumulator widths, sized so the sums cannot overflow:
    //   FC1: |sum| <= 32 * 128       = 4096      -> fits in 16 signed bits
    //   FC2: |sum| <= 64 * 127 * 128 = 1040384   -> fits in 24 signed bits
    localparam ACC1_W = 16;
    localparam ACC2_W = 24;

    // ------------------------------------------------------------------
    // Weights and biases (8-bit signed). Stored as flat memories:
    //   fc1_w[i*N_HID + h] : input i  -> hidden neuron h
    //   fc2_w[h*N_OUT + p] : hidden h -> output logit p
    // They are variables (reg) because they are written from an initial block.
    // ------------------------------------------------------------------
    reg signed [7:0] fc1_w [0:N_IN*N_HID-1];
    reg signed [7:0] fc1_b [0:N_HID-1];
    reg signed [7:0] fc2_w [0:N_HID*N_OUT-1];
    reg signed [7:0] fc2_b [0:N_OUT-1];

    integer k;
    initial begin
        // Zero everything first so no X can propagate to output_pos.
        for (k = 0; k < N_IN*N_HID;  k = k + 1) fc1_w[k] = 8'sd0;
        for (k = 0; k < N_HID;       k = k + 1) fc1_b[k] = 8'sd0;
        for (k = 0; k < N_HID*N_OUT; k = k + 1) fc2_w[k] = 8'sd0;
        for (k = 0; k < N_OUT;       k = k + 1) fc2_b[k] = 8'sd0;

        // Example values only.
        // TODO: load the real trained weights here (e.g. with $readmemh).
        fc1_w[0*N_HID + 0] =  8'sd5;
        fc1_w[0*N_HID + 1] = -8'sd12;
        fc2_w[0*N_OUT + 0] =  8'sd34;
        fc2_w[0*N_OUT + 1] = -8'sd21;
    end

    // Post-ReLU hidden activations (always 0..127) and the output logits.
    wire signed [7:0]        hidden [0:N_HID-1];
    wire signed [ACC2_W-1:0] logit  [0:N_OUT-1];

    genvar i, h, p;

    // ------------------------------------------------------------------
    // FC1 + ReLU. Each partial sum has its own wire (acc[i] -> acc[i+1]),
    // so every net has exactly one driver and there is no feedback loop.
    // ------------------------------------------------------------------
    generate
        for (h = 0; h < N_HID; h = h + 1) begin : fc1_layer
            wire signed [ACC1_W-1:0] acc [0:N_IN];
            wire signed [ACC1_W-1:0] pre;

            assign acc[0] = {ACC1_W{1'b0}};
            for (i = 0; i < N_IN; i = i + 1) begin : fc1_mac
                // Inputs are 0/1, so the multiply reduces to a conditional add.
                // All operands are signed, so the weight is sign-extended.
                assign acc[i+1] = acc[i] + (input_bits[i] ? fc1_w[i*N_HID + h] : 8'sd0);
            end

            // Arithmetic shift (keeps the sign), then bias.
            assign pre = (acc[N_IN] >>> SHIFT) + fc1_b[h];

            // ReLU with saturation to the positive 8-bit range instead of wrap-around.
            assign hidden[h] = (pre <= 0)  ? 8'sd0   :
                               (pre > 127) ? 8'sd127 :
                                             $signed(pre[7:0]);
        end
    endgenerate

    // ------------------------------------------------------------------
    // FC2: signed multiply-accumulate over the hidden layer.
    // ------------------------------------------------------------------
    generate
        for (p = 0; p < N_OUT; p = p + 1) begin : fc2_layer
            wire signed [ACC2_W-1:0] acc [0:N_HID];

            assign acc[0] = {ACC2_W{1'b0}};
            for (h = 0; h < N_HID; h = h + 1) begin : fc2_mac
                assign acc[h+1] = acc[h] + hidden[h] * fc2_w[h*N_OUT + p];
            end

            // Logits stay at accumulator width; nothing is truncated before the argmax.
            assign logit[p] = (acc[N_HID] >>> SHIFT) + fc2_b[p];
        end
    endgenerate

    // ------------------------------------------------------------------
    // Argmax: running (value, index) pair carried through a compare chain.
    // The strict '>' keeps the lowest index when several logits tie.
    // ------------------------------------------------------------------
    wire signed [ACC2_W-1:0] best_val [0:N_OUT-1];
    wire        [3:0]        best_idx [0:N_OUT-1];

    assign best_val[0] = logit[0];
    assign best_idx[0] = 4'd0;

    generate
        for (p = 1; p < N_OUT; p = p + 1) begin : argmax_stage
            localparam [3:0] IDX = p;   // this stage's index as a 4-bit constant
            wire take = (logit[p] > best_val[p-1]);

            assign best_val[p] = take ? logit[p] : best_val[p-1];
            assign best_idx[p] = take ? IDX      : best_idx[p-1];
        end
    endgenerate

    assign output_pos = best_idx[N_OUT-1];  // Direct 4-bit output (pos 0-8 in binary)

endmodule
	// Testbench for tictactoe_predictor
	// Simulates inputs, dumps waves for visualization

	`timescale 1ns / 1ps

	// Self-checking testbench for tictactoe_predictor.
	// Drives three input vectors, prints the predicted position for each, counts
	// failed checks and dumps all signals to tictactoe_sim.vcd.
	// NOTE(review): a module with this same name is also defined earlier in this
	// file; duplicate module definitions are rejected by most tools, so one of
	// the two copies should be removed.
	module tb_tictactoe_predictor;
	    // Stimulus driven into the DUT and the position index it returns.
	    reg  [31:0] input_bits;
	    wire [3:0]  output_pos;

	    // Number of failed checks; reported just before $finish.
	    integer error_count;

	    // Instantiate the DUT (Design Under Test)
	    tictactoe_predictor dut (
	        .input_bits(input_bits),
	        .output_pos(output_pos)
	    );

	    // apply_vector: drive one input vector, wait 10 ns for the combinational
	    // logic to settle, print the result, and sanity-check that the output is
	    // fully known and inside the 0-8 range declared on the DUT port.
	    task apply_vector;
	        input [31:0] vec;
	        begin
	            input_bits = vec;
	            #10;
	            $display("Input: %b, Output pos: %b (dec: %d)", input_bits, output_pos, output_pos);
	            // Reduction XOR is X whenever any bit of output_pos is X or Z.
	            if (((^output_pos) === 1'bx) || (output_pos > 4'd8)) begin
	                $display("ERROR: output_pos is unknown (X/Z) or outside 0-8");
	                error_count = error_count + 1;
	            end
	        end
	    endtask

	    // expect_pos: compare the current output with an expected position.
	    // Uses case inequality (!==) so that an X/Z output counts as a failure;
	    // with plain != the comparison result would be X and the check would be
	    // skipped silently.
	    task expect_pos;
	        input integer test_id;
	        input [3:0]   expected;
	        begin
	            if (output_pos !== expected) begin
	                $display("ERROR: Test %0d failed! (expected %0d, got %0d)",
	                         test_id, expected, output_pos);
	                error_count = error_count + 1;
	            end
	        end
	    endtask

	    initial begin
	        error_count = 0;

	        $dumpfile("tictactoe_sim.vcd");        // VCD for waveform viewer
	        $dumpvars(0, tb_tictactoe_predictor);  // Dump every signal below this testbench

	        // NOTE(review): the expected positions below come from the original
	        // author and presumably assume trained weights are loaded in the DUT;
	        // confirm them against the reference model.

	        // Test 1: vector 0x00000014 (bits 4 and 2 set), expected position 4.
	        // NOTE(review): this case was originally labelled "empty board" even
	        // though the vector is non-zero - confirm the intended board encoding.
	        apply_vector(32'b00000000000000000000000000010100);
	        expect_pos(1, 4'd4);

	        // Test 2: the author's example "0000 0010 0100", applied here as
	        // 0x00000480, expected position 8.
	        // NOTE(review): bit order and padding of this encoding are unconfirmed.
	        apply_vector(32'b00000000000000000000010010000000);
	        expect_pos(2, 4'd8);

	        // Test 3: all-zero input. No expected position was specified, so only
	        // the known/in-range sanity check inside apply_vector applies.
	        apply_vector(32'b00000000000000000000000000000000);

	        if (error_count == 0)
	            $display("All checks passed");
	        else
	            $display("%0d check(s) failed", error_count);

	        $finish;  // End sim
	    end

	endmodule
	// Tic-Tac-Toe O-Move Predictor: Verilog Hardware Inference Engine
	// Simplified MLP: 32-bit input -> FC1 (32x64) -> ReLU -> FC2 (64x9) -> Argmax -> 4-bit binary output
	// Assumptions:
	// - Fixed-point arithmetic: 8-bit signed integers (Q7.0, no fractions for simplicity; scale weights accordingly)
	// - Combinatorial logic (no clock; for FPGA/ASIC, add clk/rst for pipelining)
	// - Weights hardcoded (example values; replace with trained int-scaled ones, e.g., *128 and round)
	// - ReLU: max(0, x)
	// - Argmax: Finds index of max logit (0-8)
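	// - Input: 32-bit vector [31:0]; input_bits[i] selects FC1 weight row i, so the board-to-bit mapping (including padding bits) must match the encoding used in training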
	// - Output: 4-bit binary position (e.g., 1000 for pos8)
	// Synthesize with tools like Vivado; simulate with ModelSim

	// tictactoe_predictor: purely combinational 32 -> 64 -> 9 MLP followed by argmax.
	//
	// Data path (integer arithmetic, no clock):
	//   FC1    : acc1[h]   = sum_i (input_bits[i] ? fc1_w[i][h] : 0)
	//            hidden[h] = clamp((acc1[h] >>> SHIFT) + fc1_b[h], 0, 127)   // ReLU + saturation
	//   FC2    : acc2[p]   = sum_h (hidden[h] * fc2_w[h][p])
	//            logit[p]  = (acc2[p] >>> SHIFT) + fc2_b[p]
	//   Argmax : output_pos = lowest index p holding the largest logit[p]
	//
	// Ports:
	//   input_bits - 32-bit feature vector; bit i is paired with FC1 weight row i.
	//   output_pos - index (0-8) of the largest logit, as a 4-bit binary number.
	//
	// Only four example weights are populated; every other weight and all biases
	// are zero until real trained values are supplied.
	//
	// NOTE(review): a module with this same name is also defined earlier in this
	// file; duplicate module definitions are rejected by most tools, so one of
	// the two copies should be removed.
	module tictactoe_predictor (
	    input  [31:0] input_bits,      // 32-bit input vector (bit i feeds FC1 weight row i)
	    output [3:0]  output_pos       // 4-bit binary position (0-8)
	);

	    // Network dimensions.
	    localparam N_IN   = 32;        // input features
	    localparam N_HID  = 64;        // hidden neurons
	    localparam N_OUT  = 9;         // output logits (board positions)

	    // Right shift applied to each accumulator before the bias is added.
	    localparam SHIFT  = 7;

	    // Accumulator widths, sized so the sums cannot overflow:
	    //   FC1: |sum| <= 32 * 128       = 4096      -> fits in 16 signed bits
	    //   FC2: |sum| <= 64 * 127 * 128 = 1040384   -> fits in 24 signed bits
	    localparam ACC1_W = 16;
	    localparam ACC2_W = 24;

	    // ------------------------------------------------------------------
	    // Weights and biases (8-bit signed). Stored as flat memories:
	    //   fc1_w[i*N_HID + h] : input i  -> hidden neuron h
	    //   fc2_w[h*N_OUT + p] : hidden h -> output logit p
	    // They are variables (reg) because they are written from an initial block.
	    // ------------------------------------------------------------------
	    reg signed [7:0] fc1_w [0:N_IN*N_HID-1];
	    reg signed [7:0] fc1_b [0:N_HID-1];
	    reg signed [7:0] fc2_w [0:N_HID*N_OUT-1];
	    reg signed [7:0] fc2_b [0:N_OUT-1];

	    integer k;
	    initial begin
	        // Zero everything first so no X can propagate to output_pos.
	        for (k = 0; k < N_IN*N_HID;  k = k + 1) fc1_w[k] = 8'sd0;
	        for (k = 0; k < N_HID;       k = k + 1) fc1_b[k] = 8'sd0;
	        for (k = 0; k < N_HID*N_OUT; k = k + 1) fc2_w[k] = 8'sd0;
	        for (k = 0; k < N_OUT;       k = k + 1) fc2_b[k] = 8'sd0;

	        // Example values only.
	        // TODO: load the real trained weights here (e.g. with $readmemh).
	        fc1_w[0*N_HID + 0] =  8'sd5;
	        fc1_w[0*N_HID + 1] = -8'sd12;
	        fc2_w[0*N_OUT + 0] =  8'sd34;
	        fc2_w[0*N_OUT + 1] = -8'sd21;
	    end

	    // Post-ReLU hidden activations (always 0..127) and the output logits.
	    wire signed [7:0]        hidden [0:N_HID-1];
	    wire signed [ACC2_W-1:0] logit  [0:N_OUT-1];

	    genvar i, h, p;

	    // ------------------------------------------------------------------
	    // FC1 + ReLU. Each partial sum has its own wire (acc[i] -> acc[i+1]),
	    // so every net has exactly one driver and there is no feedback loop.
	    // ------------------------------------------------------------------
	    generate
	        for (h = 0; h < N_HID; h = h + 1) begin : fc1_layer
	            wire signed [ACC1_W-1:0] acc [0:N_IN];
	            wire signed [ACC1_W-1:0] pre;

	            assign acc[0] = {ACC1_W{1'b0}};
	            for (i = 0; i < N_IN; i = i + 1) begin : fc1_mac
	                // Inputs are 0/1, so the multiply reduces to a conditional add.
	                // All operands are signed, so the weight is sign-extended.
	                assign acc[i+1] = acc[i] + (input_bits[i] ? fc1_w[i*N_HID + h] : 8'sd0);
	            end

	            // Arithmetic shift (keeps the sign), then bias.
	            assign pre = (acc[N_IN] >>> SHIFT) + fc1_b[h];

	            // ReLU with saturation to the positive 8-bit range instead of wrap-around.
	            assign hidden[h] = (pre <= 0)  ? 8'sd0   :
	                               (pre > 127) ? 8'sd127 :
	                                             $signed(pre[7:0]);
	        end
	    endgenerate

	    // ------------------------------------------------------------------
	    // FC2: signed multiply-accumulate over the hidden layer.
	    // ------------------------------------------------------------------
	    generate
	        for (p = 0; p < N_OUT; p = p + 1) begin : fc2_layer
	            wire signed [ACC2_W-1:0] acc [0:N_HID];

	            assign acc[0] = {ACC2_W{1'b0}};
	            for (h = 0; h < N_HID; h = h + 1) begin : fc2_mac
	                assign acc[h+1] = acc[h] + hidden[h] * fc2_w[h*N_OUT + p];
	            end

	            // Logits stay at accumulator width; nothing is truncated before the argmax.
	            assign logit[p] = (acc[N_HID] >>> SHIFT) + fc2_b[p];
	        end
	    endgenerate

	    // ------------------------------------------------------------------
	    // Argmax: running (value, index) pair carried through a compare chain.
	    // The strict '>' keeps the lowest index when several logits tie.
	    // ------------------------------------------------------------------
	    wire signed [ACC2_W-1:0] best_val [0:N_OUT-1];
	    wire        [3:0]        best_idx [0:N_OUT-1];

	    assign best_val[0] = logit[0];
	    assign best_idx[0] = 4'd0;

	    generate
	        for (p = 1; p < N_OUT; p = p + 1) begin : argmax_stage
	            localparam [3:0] IDX = p;   // this stage's index as a 4-bit constant
	            wire take = (logit[p] > best_val[p-1]);

	            assign best_val[p] = take ? logit[p] : best_val[p-1];
	            assign best_idx[p] = take ? IDX      : best_idx[p-1];
	        end
	    endgenerate

	    assign output_pos = best_idx[N_OUT-1];  // Direct 4-bit output (pos 0-8 in binary)

	endmodule