from __future__ import division
import math
import nltk
from nltk import bigrams

def compute_idf(corpus, keyword):
    # inverse document frequency of keyword over all corpus files
    total_occur = 0
    keyword_occur = 0
    for filename in corpus:
        raw = open(filename).read()
        tokens = nltk.word_tokenize(raw)
        total_occur = total_occur + len(tokens)
        keyword_occur = keyword_occur + len([w for w in tokens if w == keyword])
    # add-one smoothing on the denominator (assumption: the bare division in the
    # original raises ZeroDivisionError for keywords absent from the corpus)
    return math.log(total_occur / (keyword_occur + 1))
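
# Illustrative check of the idf computation on synthetic counts (not part of
# the pipeline): with 1000 corpus tokens and 9 keyword occurrences it returns
# math.log(1000 / (9 + 1)) = math.log(100.0) ~= 4.6, so rarer keywords score
# higher.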

def parents(keyword, prevlevelsynsets):
    # synsets of the previous level whose gloss contains the keyword
    parents = []
    for syn in prevlevelsynsets:
        if type(syn) is nltk.corpus.reader.wordnet.Synset:
            # .definition is an attribute in the NLTK 2.x this script targets
            # (it became a method, .definition(), in NLTK 3)
            syndef_tokens = set(nltk.word_tokenize(syn.definition))
            if keyword in syndef_tokens:
                parents = parents + [syn]
    return parents
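
# Usage sketch (illustrative): if prevlevelsynsets contains
# wn.synset('car.n.01'), whose gloss is "a motor vehicle with four wheels...",
# then parents('wheels', prevlevelsynsets) should return [Synset('car.n.01')].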

def best_matching_synset(doc_tokens, synsets):
    # Lesk-style WSD: pick the synset whose gloss overlaps the document most
    maxmatch = -1
    retset = []
    for synset in synsets:
        def_tokens = set(nltk.word_tokenize(synset.definition))
        intersection = def_tokens.intersection(doc_tokens)
        if len(intersection) > maxmatch:
            maxmatch = len(intersection)
            retset = synset
    return retset
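
# Usage sketch (illustrative): given river-related context tokens, the
# sloping-land sense of 'bank' should win over the financial-institution one:
#   best_matching_synset(set(['river', 'water']), wn.synsets('bank'))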

def get_context(query, documents):
    # 10-token window around the first occurrence of query in the document
    file1 = open(documents[0])
    file_contents = file1.read()
    file_tokens = nltk.word_tokenize(file_contents)
    first_occur = file_tokens.index(query)
    context = ''
    for i in file_tokens[first_occur-5:first_occur+5]:
        context = context + ' ' + i
    return context
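
# Usage sketch (illustrative): get_context('car', references) returns the
# 5 tokens before the first 'car', the token itself and the 4 tokens after
# it, joined by single spaces.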

def get_jaccard_coefficient(refanswer, candidanswer):
    # add-one smoothed Jaccard similarity between the bigram shingle sets
    total = len(set(shingles(refanswer) + shingles(candidanswer)))
    intersect = len(set(shingles(refanswer)).intersection(set(shingles(candidanswer))))
    return (intersect + 1) / (total + 1)

def shingles(phrase):
    # bigram shingles; wrapped in list() so the results can be concatenated above
    return list(bigrams(nltk.word_tokenize(phrase)))
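
# Worked example (illustrative): 'the car raced' and 'the car won' shingle to
# {('the','car'), ('car','raced')} and {('the','car'), ('car','won')}; the
# union has 3 shingles and the intersection 1, so the smoothed score is
# (1 + 1) / (3 + 1) = 0.5.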

corpus = [
'/media/disk/CMI-related/ThesisDemo-english-test1.txt',
'/media/disk/CMI-related/ThesisDemo-english-test2.txt',
'/media/disk/CMI-related/ThesisDemo-english-test3.txt',
'/media/disk/CMI-related/ThesisDemo-carrace-test1.txt',
'/media/disk/CMI-related/ThesisDemo-carrace-test2.txt',
'/media/disk/CMI-related/ThesisDemo-carrace-test3.txt',
'/media/disk/CMI-related/ThesisDemo-datamining-test1.txt',
'/media/disk/CMI-related/ThesisDemo-datamining-test2.txt',
'/media/disk/CMI-related/ThesisDemo-datamining-test3.txt',
'/media/disk/CMI-related/ThesisDemo-datamining-test4.txt',
'/media/disk/CMI-related/ThesisDemo-datamining-test5.txt',
'/media/disk/CMI-related/ThesisDemo-graphtheory-test1.txt',
'/media/disk/CMI-related/ThesisDemo-graphtheory-test2.txt',
'/media/disk/CMI-related/ThesisDemo-graphtheory-test3.txt',
'/media/disk/CMI-related/ThesisDemo-fermatslasttheorem-test16.txt',
'/media/disk/CMI-related/ThesisDemo-fermatslasttheorem-test17.txt',
'/media/disk/CMI-related/ThesisDemo-car-test1.txt',
'ThesisDemo-newsmodipranab-test1.txt',
'ThesisDemo-newsmodipranab-test2.txt']

files = [
'ThesisDemo-english-test1.txt',
'ThesisDemo-english-test2.txt',
'ThesisDemo-english-test3.txt',
'ThesisDemo-carrace-test1.txt',
'ThesisDemo-carrace-test2.txt',
'ThesisDemo-carrace-test3.txt',
'ThesisDemo-datamining-test1.txt',
'ThesisDemo-datamining-test2.txt',
'ThesisDemo-datamining-test3.txt',
'ThesisDemo-datamining-test4.txt',
'ThesisDemo-datamining-test5.txt',
'ThesisDemo-graphtheory-test1.txt',
'ThesisDemo-graphtheory-test2.txt',
'ThesisDemo-graphtheory-test3.txt',
'ThesisDemo-fermatslasttheorem-test16.txt',
'ThesisDemo-fermatslasttheorem-test17.txt']

#---------------------------------------------------------------------------------
#2. Compute intrinsic merit (either using linear or quadratic overlap)
#---------------------------------------------------------------------------------
for filestr in files:
    filestrtok = filestr.split('-')
    outputfile = 'Output-' + filestrtok[1] + '-' + filestrtok[2]
    output = open(outputfile, 'w')
    file1 = open(filestr)
    raw1 = file1.read()
    doc1 = nltk.word_tokenize(raw1)
    from nltk.book import *
    from nltk.corpus import stopwords
    fdist1 = FreqDist(doc1)
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = stopwords + ['may', 'The', 'the', 'In', 'in', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    puncts = ['.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';', '\'s']
    #at present tfidf filter is not applied
    freqterms1 = [w for w in fdist1.keys() if w not in stopwords and w not in puncts and (fdist1.freq(w) * compute_idf(corpus, w))]
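    # Note (illustrative): the comprehension above keeps w whenever
    # fdist1.freq(w) * compute_idf(corpus, w) is non-zero (truthy); a real
    # tf-idf cutoff would compare it against a threshold such as the 0.1 used
    # by the interview step below.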
    current_level = 1
    nodewithmaxparents = ''
    maxparents = 0
    relatedness = 0
    first_convergence_level = 1
    converged = False
    tokensofthislevel = []
    convergingparents = []
    tokensofprevlevel = []
    prevlevelsynsets = []
    vertices = 0
    edges = 0
    from nltk.corpus import wordnet as wn

    #recurse down to required depth and update intrinsic merit score
    #relatedness is either sum(overlaps) or sum((overlapping_parents)*(overlaps)^2), also called convergence factor
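    # Worked example (illustrative): if a level grasps 40 gloss tokens of which
    # 5 are duplicates (overlap = 5) and 3 parents converge, the linear
    # criterion below adds 5 + 3 = 8 to relatedness, while the commented-out
    # quadratic criterion would add (3 * 5 * 5) + 1 = 76.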
    while current_level < 3:
        #crucial - gather nodes which converge/overlap (have more than 1 parent)
        if current_level > 1:
            convergingterms = [w for w in freqterms1 if len(parents(w, prevlevelsynsets)) > 1]
            for kw in freqterms1:
                convergingparents = convergingparents + ([w for w in parents(kw, prevlevelsynsets) if len(parents(kw, prevlevelsynsets)) > 1])
            for kw in freqterms1:
                noofparents = len(parents(kw, prevlevelsynsets))
                if noofparents > maxparents:
                    maxparents = noofparents
                    nodewithmaxparents = kw
            output.write('converging terms(terms with more than 1 parent):\n ')
            #pickle.dump(convergingterms,output)
            output.write('converging parents :\n')
            #pickle.dump(convergingparents,output)
        for keyword in freqterms1:
            #WSD - invokes Lesk's algorithm adapted to recursive gloss overlap - best_matching_synset()
            output.write('===============================================\n')
            output.write('keyword : ' + keyword + '\n')
            #disamb_synset = best_matching_synset(set(doc1), wn.synsets(keyword))
            disamb_synset = best_matching_synset(freqterms1, wn.synsets(keyword))
            prevlevelsynsets = prevlevelsynsets + [disamb_synset]
            output.write('prevlevelsynsets:\n')
            #pickle.dump(prevlevelsynsets, output)
            output.write('matching synset:\n')
            #pickle.dump(disamb_synset,output)
            if len(wn.synsets(keyword)) != 0:
                disamb_synset_def = disamb_synset.definition
                tokens = nltk.word_tokenize(disamb_synset_def)
                fdist_tokens = FreqDist(tokens)
                #at present frequency filter is not applied
                #if keyword in convergingterms:
                tokensofthislevel = tokensofthislevel + ([w for w in fdist_tokens.keys() if w not in stopwords and w not in puncts and fdist_tokens.freq(w)])
        output.write('At level:\n')
        output.write(str(current_level))
        output.write('tokens grasped at this level:\n')
        #pickle.dump(tokensofthislevel, output)
        listcount = len(tokensofthislevel)
        setcount = len(set(tokensofthislevel))
        overlap = listcount - setcount
        # record only the first level at which tokens overlap
        if overlap > 0 and not converged:
            first_convergence_level = current_level
            converged = True
        #choose between two relatedness/convergence criteria :-
        #1) simple linear overlap or 2) zipf distributed quadratic overlap
        #relatedness = relatedness + len(convergingparents)*overlap
        relatedness = relatedness + overlap + len(convergingparents)
        #relatedness = relatedness + ((len(convergingparents)*overlap*overlap) + 1)
        #find out common tokens of this and previous level so that same token does not get grasped again -
        #relatedness must be increased since repetition of keywords in two successive levels is a sign of
        #interrelatedness (a backedge from child-of-one-of-siblings to one-of-siblings). Remove vertices and
        #edges corresponding to common tokens
        commontokens = set(tokensofthislevel).intersection(set(tokensofprevlevel))
        tokensofthislevel = set(tokensofthislevel).difference(commontokens)
        relatedness = relatedness + len(commontokens)
        output.write('removing tokens already grasped:\n')
        #pickle.dump(commontokens,output)
        output.write('Relatedness:\n')
        output.write(str(relatedness))
        #decrease the vertices count to address common tokens removed above - edges should remain the same
        #since they would just point elsewhere
        vertices = vertices + setcount - len(commontokens)
        output.write('Vertices:\n')
        output.write(str(vertices))
        edges = edges + listcount
        output.write('Edges:\n')
        output.write(str(edges))
        current_level = current_level + 1
        freqterms1 = set(tokensofthislevel)
        tokensofprevlevel = tokensofthislevel
        tokensofthislevel = []
    intrinsic_merit = vertices * edges * relatedness / first_convergence_level
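    # Worked example (illustrative): vertices = 100, edges = 150,
    # relatedness = 40 and first_convergence_level = 2 would give an
    # intrinsic merit of 100 * 150 * 40 / 2 = 300000.0.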
    output.write('Intrinsic merit of this document is:\n')
    output.write(str(intrinsic_merit))
    output.write('Node with maximum parents (and hence the most likely class of document) is:\n')
    output.write(nodewithmaxparents)
    output.close()

#--------------------------------
#3. Interview Algorithm
#--------------------------------

references = ['ThesisDemo-carrace-test1.txt']
candidate = ['ThesisDemo-carrace-test2.txt']

stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords + ['may', 'The', 'the', 'In', 'in', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
puncts = ['.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';', '\'s']
from nltk.book import *

queries = []
for filestr in references:
    file1 = open(filestr)
    filecontents = file1.read()
    filetokens = nltk.word_tokenize(filecontents)
    freqdist = FreqDist(filetokens)
    queries = queries + [w for w in freqdist.keys() if w not in stopwords and w not in puncts and freqdist.freq(w) * compute_idf(corpus, w) > 0.1]

print 'Most important tokens/ngrams = queries :'
print queries
ref_qandamap = {}
for q in queries:
    ref_qandamap[q] = get_context(q, references)

candid_file = open(candidate[0])
candid_file_contents = candid_file.read()
candid_tokens = nltk.word_tokenize(candid_file_contents)
candid_text = nltk.Text(candid_tokens)

# note: get_context raises ValueError for a query token absent from the
# candidate file; the demo assumes overlapping vocabularies
candid_qandamap = {}
for q in queries:
    #print '--------------------------------'
    #print 'concordance for :' + q
    #candid_text.concordance(q)
    #print '--------------------------------'
    candid_qandamap[q] = get_context(q, candidate)

print candid_qandamap

#compute jaccard coefficient score for each question and answer
scores = []
for q in queries:
    scores.append(get_jaccard_coefficient(ref_qandamap[q], candid_qandamap[q]))
sumscore = 0
for x in scores:
    sumscore = sumscore + x
print 'Interview score:'
print sumscore
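
# Worked example (illustrative): if three queries score 0.5, 0.4 and 0.9,
# the interview score printed above is 0.5 + 0.4 + 0.9 = 1.8.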

#--------------------------------------------------------------------------------------------------------------------
#4. Compute Value Addition applying edit distance between definition-graph(reference) and definition-graph(candidate)
#at present limited to 1 reference which will be updated after interview algorithm finishes if necessary
#--------------------------------------------------------------------------------------------------------------------
references = ['ThesisDemo-newsmodipranab-test1.txt']
candidate = ['ThesisDemo-newsmodipranab-test2.txt']

ref_file = open(references[0])
candid_file = open(candidate[0])
ref_file_contents = ref_file.read()
candid_file_contents = candid_file.read()
ref_tokens = nltk.word_tokenize(ref_file_contents)
candid_tokens = nltk.word_tokenize(candid_file_contents)
from nltk.book import *
from nltk.corpus import stopwords
reffdist = FreqDist(ref_tokens)
candidfdist = FreqDist(candid_tokens)
stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords + ['may', 'The', 'the', 'In', 'in', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
puncts = ['.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';', '\'s']
#at present tfidf filter is not applied
reffreqterms = [w for w in reffdist.keys() if w not in stopwords and w not in puncts and (reffdist.freq(w) * compute_idf(corpus, w))]
candfreqterms = [w for w in candidfdist.keys() if w not in stopwords and w not in puncts and (candidfdist.freq(w) * compute_idf(corpus, w))]

current_level = 1
editdistance = 0
reftokensofthislevel = []
candtokensofthislevel = []
reftokensofprevlevel = []
candtokensofprevlevel = []
refcommontokens = []
candcommontokens = []
refprevlevelsynsets = []
candprevlevelsynsets = []
ref_definitiongraphedges = []
cand_definitiongraphedges = []
from nltk.corpus import wordnet as wn

#recurse down to required depth and update edit distance between reference and candidate documents
while current_level < 3:
    if current_level > 1:
        for kw in reffreqterms:
            refparents = parents(kw, refprevlevelsynsets)
            for p in refparents:
                ref_definitiongraphedges.append((kw, p))
        for kw in candfreqterms:
            candparents = parents(kw, candprevlevelsynsets)
            for p in candparents:
                cand_definitiongraphedges.append((kw, p))
    print 'Current level:'
    print current_level
    print 'Reference DefGraph:'
    print ref_definitiongraphedges
    print 'Candidate DefGraph:'
    print cand_definitiongraphedges
    #for refkeyword, candkeyword in reffreqterms, candfreqterms:
    reffreqtermscount = 0
    candfreqtermscount = 0
    while reffreqtermscount < len(reffreqterms) or candfreqtermscount < len(candfreqterms):
        #WSD - invokes Lesk's algorithm adapted to recursive gloss overlap - best_matching_synset()
        if reffreqtermscount < len(reffreqterms):
            refdisamb_synset = best_matching_synset(reffreqterms, wn.synsets(list(reffreqterms)[reffreqtermscount]))
            print refdisamb_synset
            refprevlevelsynsets = refprevlevelsynsets + [refdisamb_synset]
        if candfreqtermscount < len(candfreqterms):
            canddisamb_synset = best_matching_synset(candfreqterms, wn.synsets(list(candfreqterms)[candfreqtermscount]))
            print canddisamb_synset
            candprevlevelsynsets = candprevlevelsynsets + [canddisamb_synset]
        if reffreqtermscount < len(reffreqterms) and len(wn.synsets(list(reffreqterms)[reffreqtermscount])) != 0:
            refdisamb_synset_def = refdisamb_synset.definition
            reftokens = nltk.word_tokenize(refdisamb_synset_def)
            reffdist_tokens = FreqDist(reftokens)
            #at present frequency filter is not applied
            #if keyword in convergingterms:
            reftokensofthislevel = reftokensofthislevel + ([w for w in reffdist_tokens.keys() if w not in stopwords and w not in puncts and reffdist_tokens.freq(w)])
        if candfreqtermscount < len(candfreqterms) and len(wn.synsets(list(candfreqterms)[candfreqtermscount])) != 0:
            canddisamb_synset_def = canddisamb_synset.definition
            candtokens = nltk.word_tokenize(canddisamb_synset_def)
            canddist_tokens = FreqDist(candtokens)
            #at present frequency filter is not applied
            #if keyword in convergingterms:
            candtokensofthislevel = candtokensofthislevel + ([w for w in canddist_tokens.keys() if w not in stopwords and w not in puncts and canddist_tokens.freq(w)])
        reffreqtermscount = reffreqtermscount + 1
        candfreqtermscount = candfreqtermscount + 1
    reflistcount = len(reftokensofthislevel)
    candlistcount = len(candtokensofthislevel)
    refsetcount = len(set(reftokensofthislevel))
    candsetcount = len(set(candtokensofthislevel))
    #find out common tokens of this and previous level so that same token does not get grasped again -
    #relatedness must be increased since repetition of keywords in two successive levels is a sign of
    #interrelatedness (a backedge from child-of-one-of-siblings to one-of-siblings). Remove vertices and
    #edges corresponding to common tokens
    refcommontokens = set(reftokensofthislevel).intersection(set(reftokensofprevlevel))
    candcommontokens = set(candtokensofthislevel).intersection(set(candtokensofprevlevel))
    reftokensofthislevel = set(reftokensofthislevel).difference(refcommontokens)
    candtokensofthislevel = set(candtokensofthislevel).difference(candcommontokens)
    current_level = current_level + 1
    reffreqterms = set(reftokensofthislevel)
    candfreqterms = set(candtokensofthislevel)
    reftokensofprevlevel = reftokensofthislevel
    candtokensofprevlevel = candtokensofthislevel
    reftokensofthislevel = []
    candtokensofthislevel = []

#collect the final level's edges after the loop exits
for kw in reffreqterms:
    refparents = parents(kw, refprevlevelsynsets)
    for p in refparents:
        ref_definitiongraphedges.append((kw, p))
for kw in candfreqterms:
    candparents = parents(kw, candprevlevelsynsets)
    for p in candparents:
        cand_definitiongraphedges.append((kw, p))
print 'Current level:'
print current_level
print 'Reference DefGraph:'
print ref_definitiongraphedges
print 'Candidate DefGraph:'
print cand_definitiongraphedges

#value addition - edges present in candidate but not in reference
valueaddition = set(cand_definitiongraphedges).difference(set(ref_definitiongraphedges))
#edit distance - edges added + edges removed
editdistance = editdistance + len(set(cand_definitiongraphedges).difference(set(ref_definitiongraphedges))) + len(set(ref_definitiongraphedges).difference(set(cand_definitiongraphedges)))
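
# Worked example (illustrative): with reference edges {(a, b), (a, c)} and
# candidate edges {(a, b), (a, d)}, the value addition is {(a, d)} and the
# edit distance is 1 edge added + 1 edge removed = 2.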
print 'Value addition:'
print valueaddition
print 'Edit Distance:'
print editdistance