# ======================================================================
# COSP 2012 - Raquel Fernandez, ILLC, UvA
#
# Implementation by Katrin Erk (www.katrinerk.com) with minor modifications.
# The original code is part of the course materials of Katrin's course
# Introduction to Computational Linguistics, Spring 2012
# http://www.katrinerk.com/courses/introduction_to_computational_linguistics_spring_2012/ics12_schedule/python-code-creating-a-vector-space-representation
# ======================================================================
#
# Builds a word-by-word co-occurrence space from the Brown corpus and
# prints a few example counts and cosine similarities.
#
# The print calls below always pass a single, pre-formatted string so that
# the script produces the same output under Python 2 and Python 3.

import math

# Number of tokens taken into account on each side of the target word.
context_size = 10


def build_space(words, context_size, space):
    """Add co-occurrence counts for *words* to *space* and return it.

    For every position in *words*, each token that lies at most
    *context_size* positions before or after it is counted once as a
    context of the token at that position. The window is clipped at the
    beginning and at the end of the list, and the token at the position
    itself is not counted.

    Parameters:
        words: indexable sequence of tokens (supports len() and [i]).
        context_size: window size on each side of the target token.
        space: mapping of target word -> counter of context words. It has
            to create an empty counter when an unknown target is accessed
            and counters have to yield 0 for unknown context words, as
            nltk.ConditionalFreqDist and collections.defaultdict(Counter)
            do.

    Returns:
        The same *space* object, updated in place.
    """
    n_words = len(words)
    for index, current in enumerate(words):
        # Clip the window so that it never leaves the list of words.
        first = max(index - context_size, 0)
        last = min(index + context_size + 1, n_words)
        for cxword_index in range(first, last):
            if cxword_index == index:
                # The target word is not part of its own context.
                continue
            # '+= 1' instead of FreqDist.inc(), which NLTK 3 no longer has.
            space[current][words[cxword_index]] += 1
    return space


# cosine similarity between word1 and word2:
#
#               sum_w space[word1][w] * space[word2][w]
# -----------------------------------------------------------
# sqrt(sum_w space[word1][w]^2) * sqrt(sum_w space[word2][w]^2)
#
def cosine(space, word1, word2):
    """Return the cosine similarity of the vectors of *word1* and *word2*.

    Parameters:
        space: mapping of target word -> mapping of context word -> count.
        word1, word2: the two target words to compare.

    Returns:
        The cosine as a float. If either vector has length zero (for
        example because the word has no counts in *space*) the cosine is
        undefined and 0.0 is returned instead of dividing by zero.
    """
    vector1 = space[word1]
    vector2 = space[word2]

    length1 = math.sqrt(sum(count * count for count in vector1.values()))
    length2 = math.sqrt(sum(count * count for count in vector2.values()))
    denominator = length1 * length2
    if denominator == 0:
        return 0.0

    # Only dimensions that are non-zero in vector1 can contribute to the
    # dot product; get() treats dimensions missing from vector2 as 0.
    numerator = sum(count * vector2.get(w, 0) for w, count in vector1.items())
    return float(numerator) / float(denominator)


if __name__ == "__main__":
    # NLTK is only needed for the corpus run, so it is imported here;
    # build_space() and cosine() stay usable without it.
    # The names assigned below remain module-level globals, so they can
    # still be inspected after the run (e.g. with "python -i").
    import nltk

    print("reading Brown corpus...")
    brown_words = list(nltk.corpus.brown.words())

    print("computing space...")
    # In a ConditionalFreqDist, if a word is not a condition yet, then
    # accessing it creates a new empty FreqDist for that word.
    space = build_space(brown_words, context_size, nltk.ConditionalFreqDist())
    print("... space computed.")

    print("Some examples from the model...")

    for target in ('election', 'water'):
        print("co-occurrence frequency counts for '%s':" % target)
        # most_common() returns the pairs sorted by decreasing count.
        for cxword, count in space[target].most_common(50):
            print("%s : %s" % (cxword, count))

    print("Matrix with vectors for target words 'water' and 'election' showing only 3 dimensions\n")
    # tabulate() prints the table itself and returns None, so its result
    # must not be printed.
    space.tabulate(conditions=['water', 'election'],
                   samples=['be', 'vote', 'drinking'])

    # some word similarities
    print("Some word similarities:")
    for first_word, second_word in (("fire", "water"),
                                    ("election", "vote"),
                                    ("the", "happy"),
                                    ("good", "bad")):
        print("cosine similarity score (%s,%s) = %s"
              % (first_word, second_word,
                 cosine(space, first_word, second_word)))