-- Exemplar flavor VHDL source code for a 5X5 image convolving filter
-- with 8 bit coefficients and 8 bit unsigned inputs.  The architecture
-- accepts 5 bytes at a time, and converts the 5 bytes into 5 serial bit
-- streams at 80 Mbits/second, yielding a throughput of 10 million
-- convolutions/second with an ORCA 2C04-3S208 part.  Total logic count
-- is 70 PFU's out of a total of 100, thus giving a nominal gate count of
-- 3000 gates.  The speed is limited by the propagation delay of the carry
-- bit in the 8 bit shifting accumulators, and the distribution delay of the
-- 8 select signals (sel1, sel2, ...) which control the filter loading and
-- readout on the global tristate bus that collects the filter results.
--
-- Because these filters use a distributed arithmetic approach, the filter
-- coefficients are entirely contained within the various lookup tables.
-- In other filter design methods that implement multiplier-free filters,
-- the number of adders depends on the filter coefficients.
--
-- Copyright 1995, John McCluskey
-- email: J.McCluskey@ieee.org
--
-- This was compiled with CORE 2.1.10, using Xilinx 4000 as the target
-- technology, and the resulting XNF file was retargeted to ORCA using
-- ORCA Foundry 7.0 (formerly known as FPGA Foundry from Neocad).

LIBRARY ieee;
use ieee.std_logic_1164.all;

LIBRARY exemplar;
use exemplar.exemplar_1164.all;

-- The "my_stuff" package contains definitions for synchronous write enable
-- flip flop procedures (dff, dff_v and dff_enable as used below).
-- NOTE(review): the package is not visible from this file; the argument
-- order appears to be (data, [enable,] clock, output) -- confirm against
-- the package source.
LIBRARY my_stuff;
use my_stuff.my_stuff.all;

entity convolver is
    port (
        X1, X2, X3, X4, X5 : IN    std_logic_vector(7 downto 0);  -- five input bytes
        RESULT             : INOUT std_logic_vector(7 downto 0);  -- signed_char
        RDY                : OUT   std_logic;  -- true pulse when ready for next words
        CLK                : IN    std_logic   -- clock input
    );
end convolver;

architecture Orca of convolver is

    -- 8 bit signed coefficient and the lookup-table type built from it
    subtype byte is integer range -128 to 127;
    type lookup is array (natural range <>) of byte;

    -- this is a 32 bit shift register with 4 taps
    component sreg4
        port (
            CLK, SDIN        : IN  std_logic;
            Q8, Q16, Q24, Q32: OUT std_logic
        );
    end component;

    -- this is a loadable 8 bit shifting accumulator
    component acc8
        port (
            CLK, LD, D7, D6, D5, D4, D3, D2, D1, D0: IN  std_logic;
            Q7, Q6, Q5, Q4, Q3, Q2, Q1, Q0         : OUT std_logic
        );
    end component;

    -- this is a loadable 12 bit accumulator that doesn't shift
    -- it has an input that is a 2's complement 8 bit number
    component acc12l
        port (
            CLK, LD, A7, A6, A5, A4, A3, A2, A1, A0            : IN  std_logic;
            Q11, Q10, Q9, Q8, Q7, Q6, Q5, Q4, Q3, Q2, Q1, Q0   : OUT std_logic
        );
    end component;

    -- 5 wires to carry the serialized input bytes
    signal serial: std_logic_vector(4 downto 0);

    -- CNT is the registered phase counter, TCNT its next value
    signal CNT, TCNT: std_logic_vector(2 downto 0);
    attribute use_modgen: boolean;
    attribute use_modgen of tcnt: signal is false;

    -- registered copies of the five input bytes
    signal Y1, Y2, Y3, Y4, Y5: std_logic_vector(7 downto 0);

    -- internal tri-state bus and its registered copy
    signal int_bus, lat_bus: std_logic_vector(7 downto 0);

    -- first serial bits appear here
    signal first, second: std_logic_vector(0 to 4);

    -- final 12 bit answer
    signal answer: std_logic_vector(11 downto 0);

    signal grab, sel, sel0, sel1, sel2, sel3, sel4, sel5, sel6, sel7: std_logic;

begin

    -- basic 8 cycle counter drives everything
    TCNT <= CNT + "001";
    dff_v(TCNT, CLK, CNT);

    sel <= '1' when CNT = "111" else '0';

    -- Pass the sel strobe down a chain of flip flops to obtain one select
    -- strobe per counter phase.  (The original comment called this a Johnson
    -- counter; there is no inverted feedback, it is a plain shift chain.)
    dff(sel,  clk, sel0);
    dff(sel0, clk, sel1);
    dff(sel1, clk, sel2);
    dff(sel2, clk, sel3);
    dff(sel3, clk, sel4);
    dff(sel4, clk, sel5);
    dff(sel5, clk, sel6);
    dff(sel6, clk, sel7);
    dff(sel7, clk, grab);

    -- enable latching the new words when grab is true
    dff_enable(X1, grab, CLK, Y1);
    dff_enable(X2, grab, CLK, Y2);
    dff_enable(X3, grab, CLK, Y3);
    dff_enable(X4, grab, CLK, Y4);
    dff_enable(X5, grab, CLK, Y5);
    -- this uses up 40 flip flops

    -- signal we have latched the 5 words and are ready for more.
    dff(grab, clk, RDY);

    -- define a 5 wide 8 input mux using tri-states
    -- with staggered delay output into the serial bus
    mux1: block
        signal d0: std_logic;
        signal d1: std_logic_vector(0 to 1);
        signal d2: std_logic_vector(0 to 2);
        signal d3: std_logic_vector(0 to 3);
        signal sbus: std_logic_vector(0 to 4);
    begin
        -- one tri-state driver per bit position, LSB first (sel1) through
        -- MSB last (sel0); sbus(0) carries Y5 and sbus(4) carries Y1
        sbus <= Y5(0) & Y4(0) & Y3(0) & Y2(0) & Y1(0) when sel1 = '1' else (others => 'Z');
        sbus <= Y5(1) & Y4(1) & Y3(1) & Y2(1) & Y1(1) when sel2 = '1' else (others => 'Z');
        sbus <= Y5(2) & Y4(2) & Y3(2) & Y2(2) & Y1(2) when sel3 = '1' else (others => 'Z');
        sbus <= Y5(3) & Y4(3) & Y3(3) & Y2(3) & Y1(3) when sel4 = '1' else (others => 'Z');
        sbus <= Y5(4) & Y4(4) & Y3(4) & Y2(4) & Y1(4) when sel5 = '1' else (others => 'Z');
        sbus <= Y5(5) & Y4(5) & Y3(5) & Y2(5) & Y1(5) when sel6 = '1' else (others => 'Z');
        sbus <= Y5(6) & Y4(6) & Y3(6) & Y2(6) & Y1(6) when sel7 = '1' else (others => 'Z');
        sbus <= Y5(7) & Y4(7) & Y3(7) & Y2(7) & Y1(7) when sel0 = '1' else (others => 'Z');

        dff_v(sbus,  clk, first);   -- latch the tri-state bus into "first"
        dff_v(first, clk, second);  -- and then into "second"

        -- no delay
        dff(second(0), clk, serial(0));

        -- delay by 1 bit
        dff(second(1), clk, d0);
        dff(d0, clk, serial(1));

        -- delay by 2 bits
        dff(second(2), clk, d1(0));
        dff(d1(0), clk, d1(1));
        dff(d1(1), clk, serial(2));

        -- delay by 3 bits
        dff(second(3), clk, d2(0));
        dff_v(d2(0 to 1), clk, d2(1 to 2));
        dff(d2(2), clk, serial(3));

        -- delay by 4 bits
        dff(second(4), clk, d3(0));
        dff_v(d3(0 to 2), clk, d3(1 to 3));
        dff(d3(3), clk, serial(4));

        -- original note: this block uses up 40 tri-state buffers and 20 flip flops
        -- NOTE(review): the registers visible here (first, second, d0..d3,
        -- serial) add up to 25 bits -- confirm the flip flop figure.
    end block;

    filt0: block
        -- this block implements a 4 tap FIR filter
        -- with the assumption that the input data is non-negative
        -- the filter coefficients are stored in the constant lookup table
        -- this first filter is different, since it needs no 32 bit shift register
        constant lut0: lookup(0 to 15) := (
            -17, 39, -3, 34, 97, -34, 8, -72,
            81, -102, 32, 99, 51, -43, 7, 19
        );
        signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
    begin
        -- the LSB hits this lookup table when sel2='1'
        tbus <= int2evec( lut0( evec2int(first(0 to 3)) ), 8);
        dff_v(tbus, clk, dbus);

        -- then the lookup of the LSB's hit the accumulator when sel3='1'
        a1: acc8 port map(
            CLK => CLK, LD => sel3,
            D7 => dbus(7), D6 => dbus(6), D5 => dbus(5), D4 => dbus(4),
            D3 => dbus(3), D2 => dbus(2), D1 => dbus(1), D0 => dbus(0),
            Q7 => zbus(7), Q6 => zbus(6), Q5 => zbus(5), Q4 => zbus(4),
            Q3 => zbus(3), Q2 => zbus(2), Q1 => zbus(1), Q0 => zbus(0)
        );

        -- now drive the internal tri-state bus when the answer is ready
        -- which is exactly the same cycle when we are loading the LSB's above
        int_bus <= zbus when sel3 = '1' else (others => 'Z');
    end block;
    -- the filter uses 6 PLC's and 8 tri-state buffers

    -- this filter coefficient (tap) uses the output from second(4)
    -- with the resulting answer appearing at sel4
    filt6: block
        -- this block implements a 1 tap FIR filter
        -- with the assumption that the input data is non-negative
        -- there is only 1 filter coefficient
        signal del: std_logic;
        signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
        constant coef: byte := -34;
    begin
        -- Feed the tap from second(4), as the comment above this block
        -- states.  Without this assignment "del" has no driver and the
        -- coefficient select below never sees the serial data.  second(4)
        -- is first(4) one register later, which matches this filter loading
        -- on sel4, one strobe after filt0 loads on sel3.
        del <= second(4);

        tbus <= int2evec(coef, 8) when del = '1' else (others => '0');
        dff_v(tbus, clk, dbus);

        a1: acc8 port map(
            CLK => CLK, LD => sel4,
            D7 => dbus(7), D6 => dbus(6), D5 => dbus(5), D4 => dbus(4),
            D3 => dbus(3), D2 => dbus(2), D1 => dbus(1), D0 => dbus(0),
            Q7 => zbus(7), Q6 => zbus(6), Q5 => zbus(5), Q4 => zbus(4),
            Q3 => zbus(3), Q2 => zbus(2), Q1 => zbus(1), Q0 => zbus(0)
        );

        -- now drive the internal tri-state bus when the answer is ready
        int_bus <= zbus when sel4 = '1' else (others => 'Z');
    end block;
    -- the filter uses 6 PLC's and 8 tri-state buffers

    filt1: block
        -- this block implements a 4 tap FIR filter
        -- with the assumption that the input data is non-negative
        -- the filter coefficients are stored in the constant lookup table
        constant lut0: lookup(0 to 15) := (
            -14, 60, -43, 32, 17, -4, 88, -106,
            111, -33, 14, 0, 55, -66, 77, 89
        );
        signal qbus: std_logic_vector(3 downto 0);
        signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
    begin
        -- the four shift register taps form the lookup table address
        s1: sreg4 port map(
            CLK => CLK, SDIN => serial(0),
            Q8 => qbus(3), Q16 => qbus(2), Q24 => qbus(1), Q32 => qbus(0)
        );

        tbus <= int2evec( lut0( evec2int(qbus) ), 8);
        dff_v(tbus, clk, dbus);

        -- this filter runs 1 cycle behind filt0
        a1: acc8 port map(
            CLK => CLK, LD => sel5,
            D7 => dbus(7), D6 => dbus(6), D5 => dbus(5), D4 => dbus(4),
            D3 => dbus(3), D2 => dbus(2), D1 => dbus(1), D0 => dbus(0),
            Q7 => zbus(7), Q6 => zbus(6), Q5 => zbus(5), Q4 => zbus(4),
            Q3 => zbus(3), Q2 => zbus(2), Q1 => zbus(1), Q0 => zbus(0)
        );

        -- now drive the internal tri-state bus when the answer is ready
        int_bus <= zbus when sel5 = '1' else (others => 'Z');
    end block;
    -- the filter uses 6 PLC's and 8 tri-state buffers

    filt2: block
        -- this block implements a 4 tap FIR filter
        -- with the assumption that the input data is non-negative
        -- the filter coefficients are stored in the constant lookup table
        constant lut0: lookup(0 to 15) := (
            -33, 10, -43, 2, 17, -40, 8, -106,
            10, -63, 14, 33, 55, -66, 7, 9
        );
        signal qbus: std_logic_vector(3 downto 0);
        signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
    begin
        s1: sreg4 port map(
            CLK => CLK, SDIN => serial(1),
            Q8 => qbus(3), Q16 => qbus(2), Q24 => qbus(1), Q32 => qbus(0)
        );

        tbus <= int2evec( lut0( evec2int(qbus) ), 8);
        dff_v(tbus, clk, dbus);

        a1: acc8 port map(
            CLK => CLK, LD => sel6,
            D7 => dbus(7), D6 => dbus(6), D5 => dbus(5), D4 => dbus(4),
            D3 => dbus(3), D2 => dbus(2), D1 => dbus(1), D0 => dbus(0),
            Q7 => zbus(7), Q6 => zbus(6), Q5 => zbus(5), Q4 => zbus(4),
            Q3 => zbus(3), Q2 => zbus(2), Q1 => zbus(1), Q0 => zbus(0)
        );

        -- now drive the internal tri-state bus when the answer is ready
        int_bus <= zbus when sel6 = '1' else (others => 'Z');
    end block;
    -- the filter uses 6 PLC's and 8 tri-state buffers

    filt3: block
        -- this block implements a 4 tap FIR filter
        -- with the assumption that the input data is non-negative
        -- the filter coefficients are stored in the constant lookup table
        constant lut0: lookup(0 to 15) := (
            -1, 0, 43, 15, 7, -47, 80, -16,
            11, -70, 1, 83, -102, 72, 111, 3
        );
        signal qbus: std_logic_vector(3 downto 0);
        signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
    begin
        s1: sreg4 port map(
            CLK => CLK, SDIN => serial(2),
            Q8 => qbus(3), Q16 => qbus(2), Q24 => qbus(1), Q32 => qbus(0)
        );

        tbus <= int2evec( lut0( evec2int(qbus) ), 8);
        dff_v(tbus, clk, dbus);

        a1: acc8 port map(
            CLK => CLK, LD => sel7,
            D7 => dbus(7), D6 => dbus(6), D5 => dbus(5), D4 => dbus(4),
            D3 => dbus(3), D2 => dbus(2), D1 => dbus(1), D0 => dbus(0),
            Q7 => zbus(7), Q6 => zbus(6), Q5 => zbus(5), Q4 => zbus(4),
            Q3 => zbus(3), Q2 => zbus(2), Q1 => zbus(1), Q0 => zbus(0)
        );

        -- now drive the internal tri-state bus when the answer is ready
        int_bus <= zbus when sel7 = '1' else (others => 'Z');
    end block;
    -- the filter uses 6 PLC's and 8 tri-state buffers

    filt4: block
        -- this block implements a 4 tap FIR filter
        -- with the assumption that the input data is non-negative
        -- the filter coefficients are stored in the constant lookup table
        constant lut0: lookup(0 to 15) := (
            -23, 32, -1, 123, 2, 84, 8, 106,
            15, -88, 44, 30, 5, -66, 17, 16
        );
        signal qbus: std_logic_vector(3 downto 0);
        signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
    begin
        s1: sreg4 port map(
            CLK => CLK, SDIN => serial(3),
            Q8 => qbus(3), Q16 => qbus(2), Q24 => qbus(1), Q32 => qbus(0)
        );

        tbus <= int2evec( lut0( evec2int(qbus) ), 8);
        dff_v(tbus, clk, dbus);

        a1: acc8 port map(
            CLK => CLK, LD => sel0,
            D7 => dbus(7), D6 => dbus(6), D5 => dbus(5), D4 => dbus(4),
            D3 => dbus(3), D2 => dbus(2), D1 => dbus(1), D0 => dbus(0),
            Q7 => zbus(7), Q6 => zbus(6), Q5 => zbus(5), Q4 => zbus(4),
            Q3 => zbus(3), Q2 => zbus(2), Q1 => zbus(1), Q0 => zbus(0)
        );

        -- now drive the internal tri-state bus when the answer is ready
        int_bus <= zbus when sel0 = '1' else (others => 'Z');
    end block;
    -- the filter uses 6 PLC's and 8 tri-state buffers

    filt5: block
        -- this block implements a 4 tap FIR filter
        -- with the assumption that the input data is non-negative
        -- the filter coefficients are stored in the constant lookup table
        constant lut0: lookup(0 to 15) := (
            14, 101, -3, 15, 17, -4, 88, -106,
            10, -33, 1, 0, 5, -66, 7, -99
        );
        signal qbus: std_logic_vector(3 downto 0);
        signal tbus, dbus, zbus: std_logic_vector(7 downto 0);
    begin
        s1: sreg4 port map(
            CLK => CLK, SDIN => serial(4),
            Q8 => qbus(3), Q16 => qbus(2), Q24 => qbus(1), Q32 => qbus(0)
        );

        tbus <= int2evec( lut0( evec2int(qbus) ), 8);
        dff_v(tbus, clk, dbus);

        a1: acc8 port map(
            CLK => CLK, LD => sel1,
            D7 => dbus(7), D6 => dbus(6), D5 => dbus(5), D4 => dbus(4),
            D3 => dbus(3), D2 => dbus(2), D1 => dbus(1), D0 => dbus(0),
            Q7 => zbus(7), Q6 => zbus(6), Q5 => zbus(5), Q4 => zbus(4),
            Q3 => zbus(3), Q2 => zbus(2), Q1 => zbus(1), Q0 => zbus(0)
        );

        -- now drive the internal tri-state bus when the answer is ready
        int_bus <= zbus when sel1 = '1' else (others => 'Z');
    end block;
    -- the filter uses 6 PLC's and 8 tri-state buffers

    -- latch the tri-state bus before accumulating the final results
    dff_v(int_bus, clk, lat_bus);
    -- the first valid result comes out at sel4='1'

    -- now the final result is calculated in a 12 bit accumulator, of which we will
    -- only keep the top 8 bits.
    acc1: acc12l port map(
        CLK => CLK, LD => sel4,
        A7 => lat_bus(7), A6 => lat_bus(6), A5 => lat_bus(5), A4 => lat_bus(4),
        A3 => lat_bus(3), A2 => lat_bus(2), A1 => lat_bus(1), A0 => lat_bus(0),
        Q11 => answer(11), Q10 => answer(10), Q9 => answer(9), Q8 => answer(8),
        Q7 => answer(7), Q6 => answer(6), Q5 => answer(5), Q4 => answer(4),
        Q3 => answer(3), Q2 => answer(2), Q1 => answer(1), Q0 => answer(0)
    );

    -- answer(3 downto 0) is discarded; the two result nibbles are captured
    -- on different select strobes
    dff_enable(answer(7 downto 4),  sel2, clk, result(3 downto 0));  -- output low nibble
    dff_enable(answer(11 downto 8), sel3, clk, result(7 downto 4));  -- output high nibble

end Orca;