Core RTL and digital design form the foundation of any VLSI engineer's skillset. This blog covers essential building blocks that every hardware designer must master - from arithmetic units to data manipulation circuits.
Difficulty: Beginner | Key Learning: Basic arithmetic & logic operations
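An Arithmetic Logic Unit (ALU) is the computational heart of any processor. It performs arithmetic operations (add, subtract, and in larger designs multiply) and logical operations (AND, OR, XOR, NOT). The 16-bit ALU below implements add, subtract, increment, decrement, the four bitwise operations, and logical shifts left and right; it does not include a multiplier, which is covered separately later in this post.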
// 16-bit combinational ALU.
//
// Inputs:
//   a, b     - 16-bit operands. For SHL/SHR only b[3:0] is used (shift amount).
//              For NOT/INC/DEC only 'a' is used.
//   opcode   - operation select; see the localparams below.
// Outputs:
//   result   - 16-bit result; 16'h0000 for any undefined opcode.
//   zero     - 1 when result is all zeros.
//   negative - copy of result[15].
//   carry    - bit 16 of the 17-bit add/subtract. For SUB and DEC this is the
//              borrow (1 when the unsigned subtraction wraps). 0 for logic and
//              shift operations.
//   overflow - two's-complement overflow for ADD, SUB, INC and DEC; 0 otherwise.
module alu_16bit (
    input  wire [15:0] a,
    input  wire [15:0] b,
    input  wire [3:0]  opcode,
    output reg  [15:0] result,
    output wire        zero,
    output wire        negative,
    output wire        carry,
    output wire        overflow
);
    reg carry_out;
    reg overflow_flag;

    // ALU operations
    localparam ADD = 4'b0000;
    localparam SUB = 4'b0001;
    localparam AND = 4'b0010;
    localparam OR  = 4'b0011;
    localparam XOR = 4'b0100;
    localparam NOT = 4'b0101;
    localparam SHL = 4'b0110; // Logical shift left by b[3:0]
    localparam SHR = 4'b0111; // Logical shift right by b[3:0]
    localparam INC = 4'b1000; // a + 1
    localparam DEC = 4'b1001; // a - 1

    always @(*) begin
        // Defaults so that no latch is inferred and the flags are 0 for
        // operations that do not produce them.
        carry_out     = 1'b0;
        overflow_flag = 1'b0;

        case (opcode)
            ADD: begin
                {carry_out, result} = a + b;
                // Same-sign operands producing a different-sign result.
                overflow_flag = (a[15] == b[15]) && (result[15] != a[15]);
            end
            SUB: begin
                {carry_out, result} = a - b;
                // Different-sign operands and the result sign differs from a.
                overflow_flag = (a[15] != b[15]) && (result[15] != a[15]);
            end
            AND: result = a & b;
            OR:  result = a | b;
            XOR: result = a ^ b;
            NOT: result = ~a;
            SHL: result = a << b[3:0];
            SHR: result = a >> b[3:0];
            INC: begin
                {carry_out, result} = a + 16'd1;
                // Adding +1 overflows only when a positive value turns
                // negative (16'h7FFF -> 16'h8000).
                overflow_flag = !a[15] && result[15];
            end
            DEC: begin
                {carry_out, result} = a - 16'd1;
                // Subtracting +1 overflows only when a negative value turns
                // positive (16'h8000 -> 16'h7FFF).
                overflow_flag = a[15] && !result[15];
            end
            default: result = 16'h0000;
        endcase
    end

    // Flag assignments
    assign zero     = (result == 16'h0000);
    assign negative = result[15];
    assign carry    = carry_out;
    assign overflow = overflow_flag;
endmodule
// Smoke testbench for alu_16bit: applies four stimulus vectors, 10 time units
// apart, and prints every change of the inputs/outputs via $monitor.
// It does not check results automatically; inspect the printed log.
module tb_alu_16bit;
    reg  [15:0] a, b;
    reg  [3:0]  opcode;
    wire [15:0] result;
    wire        zero, negative, carry, overflow;

    alu_16bit uut (
        .a(a),
        .b(b),
        .opcode(opcode),
        .result(result),
        .zero(zero),
        .negative(negative),
        .carry(carry),
        .overflow(overflow)
    );

    // Drive one vector onto the ALU inputs and hold it for 10 time units.
    task apply_vector;
        input [15:0] op_a;
        input [15:0] op_b;
        input [3:0]  op_sel;
        begin
            a      = op_a;
            b      = op_b;
            opcode = op_sel;
            #10;
        end
    endtask

    initial begin
        $monitor("Time=%0t Op=%b A=%h B=%h Result=%h Z=%b N=%b C=%b V=%b",
                 $time, opcode, a, b, result, zero, negative, carry, overflow);

        apply_vector(16'h0005, 16'h0003, 4'b0000); // ADD
        apply_vector(16'h0010, 16'h0005, 4'b0001); // SUB
        apply_vector(16'hFF00, 16'h0F0F, 4'b0010); // AND
        apply_vector(16'h7FFF, 16'h0001, 4'b0000); // ADD with signed overflow

        $finish;
    end
endmodule
Difficulty: Intermediate | Key Learning: Pipeline stages, throughput vs latency
A pipelined ALU breaks operations into stages, allowing multiple operations to execute simultaneously. This increases throughput at the cost of latency.
// Three-stage pipelined ALU (input register -> execute -> output register).
//
// A new operation may be presented every clock; its result appears on
// 'result' three rising clock edges later, with 'valid_out' carrying the
// matching delayed copy of 'valid_in'. Reset (rst_n, active low,
// asynchronous) clears every pipeline register to zero.
//
// Supported opcodes: 0000 add, 0001 subtract, 0010 AND, 0011 OR, 0100 XOR.
// Any other opcode produces 16'h0.
module pipelined_alu (
    input  wire        clk,
    input  wire        rst_n,
    input  wire [15:0] a,
    input  wire [15:0] b,
    input  wire [3:0]  opcode,
    input  wire        valid_in,
    output reg  [15:0] result,
    output reg         valid_out
);
    localparam OP_ADD = 4'b0000;
    localparam OP_SUB = 4'b0001;
    localparam OP_AND = 4'b0010;
    localparam OP_OR  = 4'b0011;
    localparam OP_XOR = 4'b0100;

    // Stage-1 registers: captured operands, opcode and valid
    reg [15:0] lhs_q, rhs_q;
    reg [3:0]  op_q;
    reg        vld_q1;

    // Stage-2 registers: executed result and valid
    reg [15:0] exec_q;
    reg        vld_q2;

    // Combinational execute function used by stage 2.
    function [15:0] alu_exec;
        input [3:0]  op;
        input [15:0] x;
        input [15:0] y;
        begin
            case (op)
                OP_ADD:  alu_exec = x + y;
                OP_SUB:  alu_exec = x - y;
                OP_AND:  alu_exec = x & y;
                OP_OR:   alu_exec = x | y;
                OP_XOR:  alu_exec = x ^ y;
                default: alu_exec = 16'h0;
            endcase
        end
    endfunction

    // All three pipeline stages share one clock and one asynchronous reset.
    // Non-blocking assignments make the statement order irrelevant: each
    // stage samples the previous stage's value from before the clock edge.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            lhs_q     <= 16'h0;
            rhs_q     <= 16'h0;
            op_q      <= 4'h0;
            vld_q1    <= 1'b0;
            exec_q    <= 16'h0;
            vld_q2    <= 1'b0;
            result    <= 16'h0;
            valid_out <= 1'b0;
        end else begin
            // Stage 3: output registration
            result    <= exec_q;
            valid_out <= vld_q2;

            // Stage 2: execute
            exec_q    <= alu_exec(op_q, lhs_q, rhs_q);
            vld_q2    <= vld_q1;

            // Stage 1: input registration
            lhs_q     <= a;
            rhs_q     <= b;
            op_q      <= opcode;
            vld_q1    <= valid_in;
        end
    end
endmodule
Difficulty: Advanced | Key Learning: Parallel reduction, CSA
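A Wallace tree is a fast multiplier architecture that reduces the partial products in parallel using carry-save adders (CSAs), so the number of reduction stages grows as O(log n) in the operand width n. Only the final two rows are added with a conventional carry-propagate adder.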
// 4x4 unsigned multiplier built from carry-save adder (CSA) stages.
//
// The four shifted partial-product rows are compressed with two 3:2 CSA
// stages (rows 0-2 first, then that sum/carry pair together with row 3),
// and the remaining two rows are added with an ordinary adder.
//
//   a, b    - 4-bit unsigned operands
//   product - 8-bit unsigned product a * b
module wallace_tree_4bit (
    input  wire [3:0] a,
    input  wire [3:0] b,
    output wire [7:0] product
);
    // Partial-product rows, already aligned to their bit weight:
    // row k is 'a' shifted left by k when b[k] is set, otherwise zero.
    wire [7:0] row0 = b[0] ? {4'b0000, a}       : 8'h00;
    wire [7:0] row1 = b[1] ? {3'b000,  a, 1'b0} : 8'h00;
    wire [7:0] row2 = b[2] ? {2'b00,   a, 2'b00} : 8'h00;
    wire [7:0] row3 = b[3] ? {1'b0,    a, 3'b000} : 8'h00;

    // CSA stage A: rows 0, 1, 2 -> (sum_a, cry_a)
    wire [7:0] sum_a = row0 ^ row1 ^ row2;
    wire [7:0] maj_a = (row0 & row1) | (row1 & row2) | (row0 & row2);
    wire [7:0] cry_a = {maj_a[6:0], 1'b0};   // carries move one bit up

    // CSA stage B: sum_a, cry_a, row 3 -> (sum_b, cry_b)
    wire [7:0] sum_b = sum_a ^ cry_a ^ row3;
    wire [7:0] maj_b = (sum_a & cry_a) | (cry_a & row3) | (sum_a & row3);
    wire [7:0] cry_b = {maj_b[6:0], 1'b0};

    // Final carry-propagate addition of the two remaining rows
    assign product = sum_b + cry_b;
endmodule
Difficulty: Intermediate | Key Learning: Signed multiplication, encoding
Booth's algorithm reduces the number of partial products by encoding multiplier bits, handling signed multiplication efficiently.
| Current Bit | Previous Bit | Action |
|---|---|---|
| 0 | 0 | No operation |
| 0 | 1 | Add multiplicand |
| 1 | 0 | Subtract multiplicand |
| 1 | 1 | No operation |
// 8x8 signed (two's-complement) multiplier using radix-2 Booth recoding.
// Purely combinational: the eight Booth steps are unrolled by the for loop.
//
//   multiplicand, multiplier - signed 8-bit operands
//   product                  - signed 16-bit result, multiplicand * multiplier
//
// Register layout during the iteration:
//   acc : 9-bit accumulator A. One bit wider than the operands so that
//         0 - (-128) = +128 is representable; with only 8 bits the negation
//         of the most negative multiplicand wraps and the result is wrong.
//   q   : {Q[7:0], Q-1}; q[1:0] is the Booth window {current bit, previous bit}.
//
// After eight add/subtract-and-shift steps the product is held in
// {acc, q[8:1]} (17 bits); its low 16 bits are the result.
module booth_multiplier_8bit (
    input  wire signed [7:0]  multiplicand,
    input  wire signed [7:0]  multiplier,
    output reg  signed [15:0] product
);
    reg [8:0] acc;   // Accumulator A (sign-extended by one bit)
    reg [8:0] q;     // Multiplier Q with the extra Q-1 bit in q[0]
    reg [8:0] m_ext; // Multiplicand sign-extended to 9 bits
    integer i;

    always @(*) begin
        acc   = 9'b0;
        q     = {multiplier, 1'b0};              // Q-1 = 0 initially
        m_ext = {multiplicand[7], multiplicand}; // sign extension

        for (i = 0; i < 8; i = i + 1) begin
            case (q[1:0])
                2'b01:   acc = acc + m_ext; // end of a run of 1s: add
                2'b10:   acc = acc - m_ext; // start of a run of 1s: subtract
                default: ;                  // 00 / 11: no operation
            endcase
            // Arithmetic right shift of {A, Q, Q-1} by one position:
            // acc keeps its sign bit, acc[0] moves into the top of q.
            {acc, q} = {acc[8], acc, q[8:1]};
        end

        product = {acc[7:0], q[8:1]};
    end
endmodule
Difficulty: Intermediate | Key Learning: Carry prediction, G and P signals
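A carry look-ahead adder (CLA) eliminates the ripple-carry delay by computing all carries in parallel from per-bit Generate (G = A AND B) and Propagate (P = A XOR B) signals, instead of waiting for each carry to ripple through the previous bit.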
// 4-bit carry look-ahead adder.
//
//   a, b - 4-bit addends
//   cin  - carry in
//   sum  - 4-bit sum
//   cout - carry out of bit 3
//
// Every carry is written as a flat sum-of-products of the generate/propagate
// bits and cin, so no carry expression depends on another carry.
module carry_lookahead_adder_4bit (
    input  wire [3:0] a,
    input  wire [3:0] b,
    input  wire       cin,
    output wire [3:0] sum,
    output wire       cout
);
    // Per-bit generate (both inputs 1) and propagate (exactly one input 1)
    wire [3:0] gen  = a & b;
    wire [3:0] prop = a ^ b;

    // Carry into bits 1..3 and the carry out; the carry into bit 0 is cin.
    // (&prop[h:l]) is true when every bit in that range propagates.
    wire c1 = gen[0]
            | (prop[0] & cin);

    wire c2 = gen[1]
            | (prop[1] & gen[0])
            | ((&prop[1:0]) & cin);

    wire c3 = gen[2]
            | (prop[2] & gen[1])
            | ((&prop[2:1]) & gen[0])
            | ((&prop[2:0]) & cin);

    wire c4 = gen[3]
            | (prop[3] & gen[2])
            | ((&prop[3:2]) & gen[1])
            | ((&prop[3:1]) & gen[0])
            | ((&prop[3:0]) & cin);

    // Sum bit i = propagate[i] XOR carry into bit i
    assign sum  = prop ^ {c3, c2, c1, cin};
    assign cout = c4;
endmodule
Difficulty: Beginner | Key Learning: MUX-based shifting
A barrel shifter can shift data by any number of positions in a single clock cycle using multiplexer layers.
// 8-bit combinational shifter (behavioural description).
//
//   data_in    - value to shift
//   shift_amt  - number of bit positions, 0..7
//   shift_dir  - 0 = shift left, 1 = shift right
//   arithmetic - for right shifts only: 1 = replicate data_in[7] into the
//                vacated bits, 0 = fill with zeros. Ignored for left shifts.
//   data_out   - shifted result
module barrel_shifter_8bit (
    input  wire [7:0] data_in,
    input  wire [2:0] shift_amt,
    input  wire       shift_dir,  // 0=left, 1=right
    input  wire       arithmetic, // 1=arithmetic shift (sign extend)
    output reg  [7:0] data_out
);
    always @(*) begin
        // Start from the logical right shift, then override for the other
        // two modes.
        data_out = data_in >> shift_amt;

        if (shift_dir == 1'b0) begin
            // Left shift: zeros enter from the LSB side
            data_out = data_in << shift_amt;
        end else if (arithmetic) begin
            // Kept as a stand-alone assignment so the signed operand is not
            // mixed with unsigned ones, which would turn >>> into a logical
            // shift.
            data_out = $signed(data_in) >>> shift_amt;
        end
    end
endmodule
// Alternative: multiplexer-based barrel shifter (logical left shift only).
//
// Three cascaded 2:1 mux layers shift by 1, 2 and 4 positions; layer k is
// enabled by shift_amt[k], so the total shift is shift_amt (0..7). Zeros are
// shifted in from the LSB side.
module barrel_shifter_mux (
    input  wire [7:0] data_in,
    input  wire [2:0] shift_amt,
    output wire [7:0] data_out
);
    // Output of each mux layer
    wire [7:0] after_by1;
    wire [7:0] after_by2;
    wire [7:0] after_by4;

    // Layer 0: pass through or shift left by 1
    assign after_by1 = shift_amt[0] ? (data_in   << 1) : data_in;

    // Layer 1: pass through or shift left by 2
    assign after_by2 = shift_amt[1] ? (after_by1 << 2) : after_by1;

    // Layer 2: pass through or shift left by 4
    assign after_by4 = shift_amt[2] ? (after_by2 << 4) : after_by2;

    assign data_out = after_by4;
endmodule
Difficulty: Intermediate | Key Learning: Memory design, read/write ports
A register file is a collection of registers with read and write ports, commonly used in processor designs.
// Register file with one synchronous write port and two asynchronous
// (combinational) read ports.
//
// Parameters:
//   DATA_WIDTH - width of each register
//   ADDR_WIDTH - width of the address ports
//   NUM_REGS   - number of registers in the array
// Behaviour:
//   * rst_n low (asynchronous) clears every register to zero.
//   * On a rising clock edge with wr_en high, wr_data is stored at wr_addr,
//     unless wr_addr is 0: address 0 is never written.
//   * A read of address 0 always returns zero; any other address returns the
//     current register contents without waiting for a clock edge.
module register_file #(
    parameter DATA_WIDTH = 32,
    parameter ADDR_WIDTH = 5,
    parameter NUM_REGS   = 32
)(
    input  wire                  clk,
    input  wire                  rst_n,
    // Write port
    input  wire                  wr_en,
    input  wire [ADDR_WIDTH-1:0] wr_addr,
    input  wire [DATA_WIDTH-1:0] wr_data,
    // Read port 1
    input  wire [ADDR_WIDTH-1:0] rd_addr1,
    output wire [DATA_WIDTH-1:0] rd_data1,
    // Read port 2
    input  wire [ADDR_WIDTH-1:0] rd_addr2,
    output wire [DATA_WIDTH-1:0] rd_data2
);
    localparam [DATA_WIDTH-1:0] ZERO_WORD = {DATA_WIDTH{1'b0}};

    // Storage
    reg [DATA_WIDTH-1:0] registers [0:NUM_REGS-1];
    integer idx;

    // A write is accepted only when enabled and not aimed at register 0
    // (register 0 is hardwired to zero, as in the RISC-V convention).
    wire wr_accept = wr_en && wr_addr != 0;

    // Synchronous write with asynchronous clear
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            for (idx = 0; idx < NUM_REGS; idx = idx + 1) begin
                registers[idx] <= ZERO_WORD;
            end
        end else begin
            if (wr_accept) begin
                registers[wr_addr] <= wr_data;
            end
        end
    end

    // Asynchronous reads; address 0 is forced to zero on both ports
    wire rd1_is_r0 = (rd_addr1 == 0);
    wire rd2_is_r0 = (rd_addr2 == 0);

    assign rd_data1 = rd1_is_r0 ? ZERO_WORD : registers[rd_addr1];
    assign rd_data2 = rd2_is_r0 ? ZERO_WORD : registers[rd_addr2];
endmodule
// Smoke testbench for register_file: resets the design, writes 32'hDEADBEEF
// to register 5, then reads register 5 on port 1 and register 0 on port 2 and
// prints both values.
//
// Stimulus is changed on the FALLING clock edge. The register file samples
// its write port on the rising edge; changing wr_en with a blocking
// assignment in the same time step as that edge would race with the DUT and
// could drop the write, depending on simulator scheduling.
module tb_register_file;
    parameter DATA_WIDTH = 32;
    parameter ADDR_WIDTH = 5;

    reg                   clk, rst_n;
    reg                   wr_en;
    reg  [ADDR_WIDTH-1:0] wr_addr, rd_addr1, rd_addr2;
    reg  [DATA_WIDTH-1:0] wr_data;
    wire [DATA_WIDTH-1:0] rd_data1, rd_data2;

    // Explicit named connections (plain Verilog, no SystemVerilog .* needed)
    register_file #(
        .DATA_WIDTH(DATA_WIDTH),
        .ADDR_WIDTH(ADDR_WIDTH)
    ) uut (
        .clk(clk),
        .rst_n(rst_n),
        .wr_en(wr_en),
        .wr_addr(wr_addr),
        .wr_data(wr_data),
        .rd_addr1(rd_addr1),
        .rd_data1(rd_data1),
        .rd_addr2(rd_addr2),
        .rd_data2(rd_data2)
    );

    // 10-time-unit clock period
    always #5 clk = ~clk;

    initial begin
        // Give every DUT input a defined value before reset is released
        clk = 0; rst_n = 0; wr_en = 0;
        wr_addr = 0; wr_data = 0;
        rd_addr1 = 0; rd_addr2 = 0;
        #20 rst_n = 1;

        // Write to register 5: set up on a falling edge so the data is
        // stable at the next rising edge, where the DUT captures it.
        @(negedge clk);
        wr_en = 1; wr_addr = 5; wr_data = 32'hDEADBEEF;

        // One rising edge has now passed; remove the write request.
        @(negedge clk);
        wr_en = 0;

        // Read from register 5 (and register 0, which must read as zero)
        rd_addr1 = 5; rd_addr2 = 0;
        #1; // let the combinational read paths settle
        $display("R5 = %h, R0 = %h", rd_data1, rd_data2);

        #50 $finish;
    end
endmodule
| Design | Complexity | Key Learning |
|---|---|---|
| 16-bit ALU | Beginner | Basic arithmetic & logic operations |
| Pipelined ALU | Intermediate | Pipeline stages, throughput vs latency |
| Wallace Tree | Advanced | Parallel reduction, CSA |
| Booth Multiplier | Intermediate | Signed multiplication, encoding |
| CLA | Intermediate | Carry prediction, G and P signals |
| Barrel Shifter | Beginner | MUX-based shifting |
| Register File | Intermediate | Memory design, read/write ports |
Continue your VLSI learning journey with the complete blog series: