# ======================================================================
# COSP 2012 - Raquel Fernandez, ILLC, UvA
#
# Implementation by Katrin Erk (www.katrinerk.com) with minor modifications.
# The original code is part of the course materials of Katrin's course
# Introduction to Computational Linguistics, Spring 2012
# http://www.katrinerk.com/courses/introduction_to_computational_linguistics_spring_2012/ics12_schedule/python-code-creating-a-vector-space-representation
# ======================================================================

import nltk
import math

print "reading Brown corpus..."

brown_words = list(nltk.corpus.brown.words())

print "computing space..."

context_size = 10
space = nltk.ConditionalFreqDist()

for index in range(len(brown_words)):

    # current word
    current = brown_words[ index ]

    # context before the current word: count each item
    # but no preceding context for index 0 (the beginning of the list
    # of words)
    if index > 0:
        # don't start from a cxword_index < 0 in case index < context_size

        for cxword_index in range(max(index - context_size, 0), index):
            cxword = brown_words[ cxword_index ]

            # In a ConditionalFreqDist, if 'current' is not a condition yet,
            # then accessing it creates a new empty FreqDist for 'current'
            # The FreqDist method inc() increments the count for the given item by one.
            space[ current ].inc(cxword)

    # context after the current word: count each item
    # but no succeeding context for the last item (index len(brown_words - 1))
    if index < len(brown_words) - 1:

        # don't run until a cxword_index > len(brown_words) in case
        # index + context_size > len(brown_words)
        for cxword_index in range(index + 1, min(index + context_size + 1, len(brown_words))):

            cxword = brown_words[ cxword_index ]

            # In a ConditionalFreqDist, if 'current' is not a condition yet,
            # then accessing it creates a new empty FreqDist for 'current'
            # The FreqDist method inc() increments the count for the given item by one.
            space[ current ].inc(cxword)

print "... space computed."
print "Some examples from the model..."

print "co-occurrence frequency counts for 'election':"
for cxword, count in space[ 'election' ].items()[:50]:
    print cxword, ":", count

print "co-occurrence frequency counts for 'water':"
for cxword, count in space[ 'water' ].items()[:50]:
    print cxword, ":", count

print "Matrix with vectors for target words 'water' and 'election' showing only 3 dimensions\n"
matrix = space.tabulate(conditions=['water', 'election'],
                        samples=['be','vote','drinking'])

print matrix


# cosine similarity between word1 and word2:
#
# sum_w space[word1][w] * space[word2][w]
# -----------------------------------------------------------
# sqrt(sum_w space[word1][w]^2) * sqrt(sum_w space[word2][w]^2)
#
def cosine(space, word1, word2):
    """Return the cosine similarity between the vectors of two words.

    Args:
        space: mapping from a word to its co-occurrence counts, where the
            counts themselves map a context word to a number and yield 0
            for context words that were never seen.
        word1: first target word.
        word2: second target word.

    Returns:
        The dot product of the two count vectors divided by the product
        of their Euclidean norms, as a float. If either vector is empty
        or all zero (for example because the word does not occur in the
        corpus) the similarity is undefined and 0.0 is returned instead
        of raising ZeroDivisionError.
    """
    vector1 = space[word1]
    vector2 = space[word2]

    norm1 = math.sqrt(sum(count * count for count in vector1.values()))
    norm2 = math.sqrt(sum(count * count for count in vector2.values()))
    denominator = norm1 * norm2

    # A zero norm means at least one of the words has no recorded context.
    if denominator == 0:
        return 0.0

    # Only the dimensions of word1 are visited: any dimension missing
    # from word1 contributes 0 to the dot product anyway.
    numerator = sum(vector1[w] * vector2[w] for w in vector1.keys())
    return float(numerator) / float(denominator)

# some word similarities
print "Some word similarities:"
print "cosine similarity score (fire,water) =", cosine(space, "fire", "water")
print "cosine similarity score (election,vote) =", cosine(space, "election", "vote")
print "cosine similarity score (the,happy) =", cosine(space, "the", "happy")
print "cosine similarity score (good,bad) =", cosine(space, "good", "bad")