# Computes linguistic style coordination ("alignment") between the two
# speakers of each Switchboard transcript, following the measure from
# "Echoes of Power", equation (1).
#
# Requires Christopher Potts' Switchboard Dialog Act corpus reader (swda)
# and his annotated data.  The reader is imported lazily in main() so the
# analysis functions below can be used and tested without the corpus.


def findTurn(utterances, i):
    # Given a list of utterances and an index i, compute the turn that
    # utterance i lies in.  Backchannels ('b') and fragments ('x') are not
    # considered turn changes: the turn is extended backwards and forwards
    # as far as possible without changing speaker, skipping over the other
    # speaker's backchannels/fragments.
    #
    # Returns (start, end, length): utterances[start:end] spans the turn
    # (utterances[end] is the first utterance *not* in the turn), and
    # length counts only the utterances spoken by the turn's speaker,
    # so length != end - start when backchannels are interleaved.
    #
    # NOTE(review): the loop below advances tmp past non-turn-taking
    # utterances, but the result is never assigned back to i (the original
    # had "i = tmp" commented out), so the skip is currently inert.
    # Kept as-is to preserve behaviour; confirm whether it was meant to
    # be active.
    tmp = i
    while True:
        try:
            if utterances[tmp].damsl_act_tag() in ['b', 'x']:
                tmp += 1
            else:
                break
        except IndexError:
            break
    # i = tmp
    utt = utterances[i]
    speaker = utt.caller
    start = i
    end = i + 1
    length = 1
    # extend forward
    while True:
        try:
            if utterances[end].caller == speaker:
                end += 1
                length += 1
            elif utterances[end].damsl_act_tag() in ['b', 'x']:
                end += 1
            else:
                break
        except IndexError:
            break
    # extend backward; the explicit start > 0 guard is required because
    # utterances[start-1] with start == 0 would silently wrap around to the
    # end of the list (negative indexing) instead of raising IndexError
    while start > 0:
        if utterances[start - 1].caller == speaker:
            start -= 1
            length += 1
        elif utterances[start - 1].damsl_act_tag() in ['b', 'x']:
            start -= 1
        else:
            break
    # trim so that utterances[start] and utterances[end-1] are actually
    # spoken by the turn's speaker (drop leading/trailing backchannels)
    while True:
        try:
            if not utterances[start].caller == speaker:
                start += 1
            else:
                break
        except IndexError:
            break
    while True:
        try:
            if not utterances[end - 1].caller == speaker:
                end -= 1
            else:
                break
        except IndexError:
            break
    return (start, end, length)


def getWords(utt):
    # Extracts the actual spoken words from an utterance, dropping the
    # Switchboard transcription annotations, lower-casing everything and
    # stripping leftover punctuation.
    text = utt.text.split(" ")  # tokenize the utterance
    text2 = []
    for word in text:
        word = word.strip()
        # remove pure-annotation tokens and markup-like tokens
        if word not in ['#', '/', '[', ']', '}', '--', ',', '-/', '+', '-', '((', '))']:
            if not (word.startswith('{') or word.startswith('<') or word.endswith('>') or word.startswith('(')):
                if word:
                    text2.append(word)
    text2 = [word.lower() for word in text2]
    text2 = [word.strip().strip(',.-#') for word in text2]
    return text2


def printSegment(utterances, i, j, mark):
    # Debugging helper: neatly print utterances[i:j], with an arrow at
    # index `mark`.
    for k in range(i, j):
        if k == mark:
            print("--> " + utterances[k].caller + ": " + utterances[k].text)
        else:
            print(utterances[k].caller + ": " + utterances[k].text)


def num_turns(transcript, speaker):
    # Counts how many turns in the transcript are spoken by `speaker`.
    # BUGFIX: the original ignored the `speaker` argument and counted ALL
    # turns, which inflated the population size n2 used in alignment().
    n = 0
    i = 0
    # this iterates the corpus *by turn*
    while i < len(transcript.utterances):
        turn = findTurn(transcript.utterances, i)
        i = turn[1]  # jump to the end of the turn
        # utterances[turn[0]] is guaranteed (by findTurn's trimming) to be
        # spoken by the turn's speaker
        if transcript.utterances[turn[0]].caller == speaker:
            n += 1
    return n


def base_frequency(transcript, speaker, feature):
    # Counts (as a float) in how many turns `speaker` uses at least one of
    # the words in `feature`.
    n = 0
    i = 0
    # this iterates the corpus *by turn* again
    while i < len(transcript.utterances):
        turn = findTurn(transcript.utterances, i)
        i = turn[1]
        found = False
        # check the whole turn for an occurrence of the marker
        for j in range(turn[0], turn[1]):
            utt = transcript.utterances[j]
            if utt.caller == speaker:
                words = getWords(utt)
                for word in words:
                    if word in feature:
                        found = True
                        break
            if found:
                # if we found one, we are done with this turn
                n += 1
                break
    return 1.0 * n  # 1.0* keeps later divisions true divisions under Python 2


def coordination(transcript, speaker, feature):
    # Crawls the corpus in turn tuples (t1, t2), where t2 is spoken by
    # `speaker`, and counts (as a float) how often `feature` appears in
    # both t1 and t2 simultaneously.
    # Caveat: turns can be very long, so this may overestimate alignment,
    # in particular when the features are very common words.
    n = 0
    i = 0
    # this iterates the corpus *by turn*
    while i < len(transcript.utterances):
        turn = findTurn(transcript.utterances, i)
        found1 = False  # marker occurs in the previous (other speaker's) turn
        found2 = False  # marker occurs in the current (speaker's) turn
        # check the whole turn for the occurrence of the marker;
        # this turn is the one of the other speaker
        for j in range(turn[0], turn[1]):
            utt = transcript.utterances[j]
            if not (utt.caller == speaker):
                words = getWords(utt)
                for word in words:
                    if word in feature:
                        found1 = True
                        break
            if found1:
                break
        # this might have been the last turn
        if not turn[1] < len(transcript.utterances):
            break
        # find the next turn, i.e. the turn starting where the current ends
        turn = findTurn(transcript.utterances, turn[1])
        for j in range(turn[0], turn[1]):
            utt = transcript.utterances[j]
            if utt.caller == speaker:
                words = getWords(utt)
                for word in words:
                    if word in feature:
                        found2 = True
                        break
            if found2:
                break
        if found1 and found2:
            n += 1
        i = turn[1]
    return 1.0 * n  # return a float


def alignment(transcript, speaker1, speaker2, feature):
    # Follows "Echoes of Power", equation (1): this is C^m(b, a) for
    # m = feature, b = speaker2, a = speaker1, i.e. the coordination of
    # speaker2 towards speaker1.  Feel free to modify this to other
    # alignment measures.
    #
    # The population of the probability space is the set of all turn
    # tuples (t1, t2) where t2 is spoken by speaker2.  If you change the
    # population to something else, take great care to modify the math
    # here accordingly!

    # this is the size of the population
    n2 = num_turns(transcript, speaker2)
    # if the first turn is by speaker2, there is no tuple with that turn
    # in the population
    if transcript.utterances[0].caller == speaker2:
        n2 -= 1
    # how often speaker1 uses the feature: E^m_u1
    base1 = base_frequency(transcript, speaker1, feature) / n2
    # how often speaker2 uses the feature in reply: E^m_u2->u1
    # (there are only 2 speakers, so "in reply to" doesn't restrict anything)
    base2 = base_frequency(transcript, speaker2, feature) / n2
    # conditional probability P(E^m_u2->u1 | E^m_u1) = P(A and B) / P(B);
    # coordination() counts how often the feature is used by *both*
    # speakers in the same tuple
    con = coordination(transcript, speaker2, feature) / (n2 * base1)
    # alignment = usage conditioned on the previous speaker's usage,
    # minus the general baseline usage
    return con - base2


# If you use multiple features it is much more clever to read this from a
# file; this personal-pronoun list is for demonstration purposes only.
# NOTE(review): multi-word entries ("each other", "no one", "one another")
# can never match the single tokens produced by getWords, and "his"/"her"
# appear twice; harmless for membership tests, but worth confirming.
feature_pp = ["i","you","he","they","she","we","who","them","him","me","her","us","himself","themselves","someone","herself","anyone","everyone","whom","myself","each other","yourself","no one","somebody","nobody","everybody","anybody","his","mine","ourselves","yours","hers","no-one","ours","theirs","his","their","her","my","your","our","one another"]


def main():
    # Imported lazily so the analysis functions above stay importable
    # without the swda package / corpus data installed.
    import swda
    corpus = swda.CorpusReader('swda')
    # iterate over all transcripts and print the alignment of B towards A
    # sanity check: each value should lie between -1 and 1;
    # you probably should do some statistical analysis here
    for transcript in corpus.iter_transcripts(display_progress=False):
        print(alignment(transcript, "A", "B", feature_pp))


if __name__ == "__main__":
    main()