Cosine similarity measures the angle between two vectors and can be used to perform similarity between text strings. In the following code, the two input strings are vectorized and the similarity is returned as a floating point value between 0 and 1.


SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
-- =============================================
-- Author: Adam Maus (http://adammaus.com)
-- Create date: 2012-06-20
-- Description: Cosine Similarity: http://en.wikipedia.org/wiki/Cosine_similarity
-- Determines the angle between two vectors
-- =============================================
CREATE FUNCTION [dbo].[cosine_distance]
(
@s nvarchar(4000), @t nvarchar(4000)
)
RETURNS float
AS
BEGIN
-- Create an array of letter frequencies using the unicode of @s and @t
DECLARE @sLet table(letter int, freq int) -- Pretend these are vectors
DECLARE @tLet table(letter int, freq int)
DECLARE @i int, @j int
DECLARE @c int -- The current character
-- Create the arrays
SET @i = 1
WHILE @i <= LEN(@s)
BEGIN
-- Use the UNICODE values for the character but not necessary
SET @c = UNICODE(SUBSTRING(@s, @i, 1)) + 1
-- Determine whether this character is in the vector already
SELECT @j = COUNT(letter) FROM @sLet WHERE letter = @c
IF @j > 0 BEGIN
UPDATE @sLet SET freq += 1 WHERE letter = @c
END ELSE BEGIN
INSERT INTO @sLet VALUES (@c, 1)
END
-- We want to keep @tLet consistent with @sLet so insert a 0 for that letter
SELECT @j = COUNT(letter) FROM @tLet WHERE letter = @c
IF @j = 0 BEGIN
INSERT INTO @tLet VALUES (@c, 0)
END

SELECT @i += 1
END
SET @i = 1
WHILE @i <= LEN(@t)
BEGIN
SET @c = UNICODE(SUBSTRING(@t, @i, 1)) + 1
SELECT @j = COUNT(letter) FROM @tLet WHERE letter = @c
IF @j > 0 BEGIN
UPDATE @tLet SET freq += 1 WHERE letter = @c
END ELSE BEGIN
INSERT INTO @tLet VALUES (@c, 1)
END

-- We want to keep @sLet consistent with @tLet so insert a 0 for that letter
SELECT @j = COUNT(letter) FROM @sLet WHERE letter = @c
IF @j = 0 BEGIN
INSERT INTO @sLet VALUES (@c, 0)
END

SELECT @i += 1
END
-- Compute the similarity
-- Declare the numerator for the similarity
DECLARE @numer float
SET @numer = 0
SELECT @numer += s.freq * t.freq FROM @sLet s LEFT JOIN @tLet t ON s.letter = t.letter
-- Declare the norm values and calculate the denominator for the similarity
DECLARE @sNorm int, @tNorm int
SET @sNorm = 0
SET @tNorm = 0
SELECT @sNorm += freq * freq FROM @sLet
SELECT @tNorm += freq * freq FROM @tLet
DECLARE @denom float
SET @denom = SQRT(@sNorm) * SQRT(@tNorm)
RETURN (@numer) / (@denom+1) -- The +1 eliminates the possibility 0 = @denom
END

To “install” this script using Microsoft SQL Server Management Studio, go to your database, and open Programmability > Functions and right-click on Scalar-valued Functions to add a new function.

Tagged with:
 

8 Responses to Cosine Similarity in MS SQL

  1. Pat says:

    The query completed with errors:

    Msg 102, Level 15, State 1, Procedure cosine_distance, Line 21
    Incorrect syntax near ‘0’.
    Msg 156, Level 15, State 1, Procedure cosine_distance, Line 23
    Incorrect syntax near the keyword ‘ELSE’.
    Msg 156, Level 15, State 1, Procedure cosine_distance, Line 34
    Incorrect syntax near the keyword ‘SET’.
    Msg 102, Level 15, State 1, Procedure cosine_distance, Line 35
    Incorrect syntax near ‘0’.
    Msg 156, Level 15, State 1, Procedure cosine_distance, Line 37
    Incorrect syntax near the keyword ‘ELSE’.
    Msg 156, Level 15, State 1, Procedure cosine_distance, Line 51
    Incorrect syntax near the keyword ‘DECLARE’.
    Msg 102, Level 15, State 1, Procedure cosine_distance, Line 64
    Incorrect syntax near ‘END’.

    Is it possible that the page does not render all the symbols in your program?

    • Adam says:

      Hi Pat,

      Thank you for testing the script. It looks like when I copied it out of of SQL Server Management Studio, it replaced some of the code. I fixed the problems. Could you try it again and see if you run into any issues.

      Thank you for trying out the code!
      Adam

  2. Gregory says:

    What if I want to compute similarity based on words, not on letters?

  3. Gregory says:

    I modified the function, now it calculated cosine similarity based on words. Thanks a lot for providing me with an idea, it saved me a lot of time. :)

    • Adam says:

      Hi Gregory,

      That is good news. I am curious how you calculated the similarity. Did you end up splitting the text into words and forming a bag-of-words vector?

      Thank you,
      Adam

      • Gregory says:

        CREATE FUNCTION fCOSINE_SIMILARITY
        (
        @TERM1 NVARCHAR(2000),
        @TERM2 NVARCHAR(2000),
        @TYPE INT
        )
        RETURNS FLOAT
        AS
        BEGIN
        DECLARE @TERM NVARCHAR(4000) = @TERM1 + ‘ ‘ + @TERM2

        DECLARE @ZBOROVI TABLE (ZBOR NVARCHAR(100))
        DECLARE @ZBOROVI1 TABLE (ZBOR NVARCHAR(100))
        DECLARE @ZBOROVI2 TABLE (ZBOR NVARCHAR(100))

        DECLARE @VECTOR TABLE (ZBOR NVARCHAR(100))
        DECLARE @VECTOR1 TABLE (ZBOR NVARCHAR(100), CESTOTA FLOAT)
        DECLARE @VECTOR2 TABLE (ZBOR NVARCHAR(100), CESTOTA FLOAT)

        ;WITH ZBOROVI(START_POS, END_POS)
        AS
        (
        SELECT 0 START_POS, CHARINDEX(‘ ‘, @TERM) END_POS — ‘ ‘

        UNION ALL
        SELECT END_POS + 1, CHARINDEX(‘ ‘, @TERM, END_POS + 1)
        FROM ZBOROVI
        WHERE END_POS > 0
        )
        INSERT INTO @ZBOROVI
        SELECT SUBSTRING(@TERM, START_POS, COALESCE(NULLIF(END_POS, 0), LEN(@TERM)+1)-START_POS) AS ZBOR
        FROM ZBOROVI

        ;WITH ZBOROVI(START_POS, END_POS)
        AS
        (
        SELECT 0 START_POS, CHARINDEX(‘ ‘, @TERM1) END_POS — ‘ ‘ – DELIMITER,

        UNION ALL
        SELECT END_POS + 1, CHARINDEX(‘ ‘, @TERM1, END_POS + 1)
        FROM ZBOROVI
        WHERE END_POS > 0
        )
        INSERT INTO @ZBOROVI1
        SELECT SUBSTRING(@TERM1, START_POS, COALESCE(NULLIF(END_POS, 0), LEN(@TERM1)+1)-START_POS) AS ZBOR
        FROM ZBOROVI

        ;WITH ZBOROVI(START_POS, END_POS)
        AS
        (
        SELECT 0 START_POS, CHARINDEX(‘ ‘, @TERM2) END_POS — ‘ ‘ – DELIMITER,

        UNION ALL
        SELECT END_POS + 1, CHARINDEX(‘ ‘, @TERM2, END_POS + 1)
        FROM ZBOROVI
        WHERE END_POS > 0
        )
        INSERT INTO @ZBOROVI2
        SELECT SUBSTRING(@TERM2, START_POS, COALESCE(NULLIF(END_POS, 0), LEN(@TERM2)+1)-START_POS) AS ZBOR
        FROM ZBOROVI

        INSERT INTO @VECTOR
        SELECT MIN(ZBOR) AS ZBOR
        FROM @ZBOROVI
        GROUP BY ZBOR

        INSERT INTO @VECTOR1
        SELECT MIN(ZBOR) AS ZBOR, CAST(COUNT(ZBOR) AS FLOAT) AS CESTOTA
        FROM @ZBOROVI1
        GROUP BY ZBOR

        INSERT INTO @VECTOR2
        SELECT MIN(ZBOR) AS ZBOR, CAST(COUNT(ZBOR) AS FLOAT) AS CESTOTA
        FROM @ZBOROVI2
        GROUP BY ZBOR

        INSERT INTO @VECTOR1
        SELECT V.ZBOR, CAST(0 AS FLOAT) CESTOTA
        FROM @VECTOR2 V
        LEFT JOIN @VECTOR1 V1 ON V.ZBOR=V1.ZBOR
        WHERE V1.ZBOR IS NULL

        INSERT INTO @VECTOR2
        SELECT V.ZBOR, CAST(0 AS FLOAT) CESTOTA
        FROM @VECTOR1 V
        LEFT JOIN @VECTOR2 V2 ON V.ZBOR=V2.ZBOR
        WHERE V2.ZBOR IS NULL

        DECLARE @NUM_VERSES INT
        SELECT @NUM_VERSES = COUNT(*) FROM VERSES

        IF @TYPE=1 –TERM FREQ
        BEGIN
        UPDATE @VECTOR1
        SET
        CESTOTA = V1.CESTOTA * LOG(@NUM_VERSES/A.CESTOTA)
        FROM @VECTOR1 V1
        JOIN ALL_WORDS A ON UPPER(V1.ZBOR)=A.ZBOR

        UPDATE @VECTOR2
        SET
        CESTOTA = V2.CESTOTA * LOG(@NUM_VERSES/A.CESTOTA)
        FROM @VECTOR2 V2
        JOIN ALL_WORDS A ON UPPER(V2.ZBOR)=A.ZBOR
        END

        DECLARE @NOMINAL FLOAT
        SET @NOMINAL = 0

        SELECT @NOMINAL += V1.CESTOTA * V2.CESTOTA
        FROM @VECTOR1 V1
        JOIN @VECTOR2 V2 ON V1.ZBOR = V2.ZBOR

        DECLARE @V1_NORM INT = 0
        DECLARE @V2_NORM INT = 0

        SELECT @V1_NORM += CESTOTA * CESTOTA FROM @VECTOR1
        SELECT @V2_NORM += CESTOTA * CESTOTA FROM @VECTOR2

        DECLARE @DENOM FLOAT
        SET @DENOM = SQRT(@V1_NORM) * SQRT(@V2_NORM)

        IF @DENOM = 0 SET @DENOM = 1
        RETURN @NOMINAL / @DENOM
        END

      • Gregory says:

        Basically it is the same as your code, instead of characters, I fill the vectors with words.
        Plus, I added extra code if tf*idf preffered

        • Adam says:

          Thank you for expanding the code, I think it will be a lot more useful with what you added! I will try it out as soon as I get a chance. :)

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>