Krishna iResearch Intelligent Cloud Platform - acadpdrafts
InterviewAlgorithmWithIntrinisicMerit.py
#-------------------------------------------------------------------------
#ASFER - a ruleminer which gets rules specific to a query and executes them
#
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#You should have received a copy of the GNU General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#-----------------------------------------------------------------------------------------------------------------------------------
#Copyright (C):
#Srinivasan Kannan (alias) Ka.Shrinivaasan (alias) Shrinivas Kannan
#Independent Open Source Developer, Researcher and Consultant
#Ph: 9003082186, 9791165980
#Open Source Products Profile(Krishna iResearch): http://sourceforge.net/users/ka_shrinivaasan
#Personal website(research): https://sites.google.com/site/kuja27/
#emails: ka.shrinivaasan@gmail.com, shrinivas.kannan@gmail.com, kashrinivaasan@live.com
#-----------------------------------------------------------------------------------------------------------------------------------

#########################################################################################################
#Old version of example Python code written more than three years ago, in January 2010, during the MSc thesis at IIT Chennai,
#for Intrinsic Merit computation and the Interview Algorithm
#For publications:
#1. http://arxiv.org/abs/1006.4458
#2. http://www.nist.gov/tac/publications/2010/participant.papers/CMI_IIT.proceedings.pdf
#
#Constructs a WordNet definition subgraph from documents and computes similarity measures such as edit distance
#########################################################################################################

from __future__ import division
import pickle
#function - compute_idf()
def compute_idf(corpus, keyword):
    import math
    total_occur = 0
    keyword_occur = 0
    for file in corpus:
        raw = open(file).read()
        tokens = nltk.word_tokenize(raw)
        total_occur = total_occur + len(tokens)
        keyword_occur = keyword_occur + len([w for w in tokens if w == keyword])
    return math.log(total_occur / (keyword_occur))

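#A minimal, hedged sketch of how the IDF above combines with term frequency
#(tf-idf weighting), mirroring the filter expressions used later in this script.
#This helper is illustrative only and is not called by the pipeline below; it
#assumes the keyword occurs at least once in the corpus, since compute_idf()
#takes log(total_tokens/keyword_occurrences).
def compute_tfidf(freqdist, corpus, keyword):
    #term frequency from a document's FreqDist times the corpus-level IDF
    return freqdist.freq(keyword) * compute_idf(corpus, keyword)
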
#parents (at level i-1) of a given vertex at level i
#arguments are a keyword at present level and all disambiguated synsets of previous level
def parents(keyword, prevlevelsynsets):
    parents = []
    for syn in prevlevelsynsets:
        if type(syn) is nltk.corpus.reader.wordnet.Synset:
            syndef_tokens = set(nltk.word_tokenize(syn.definition))
            if keyword in syndef_tokens:
                parents = parents + [syn]
    #output.write('Parents of ' + keyword + ' are:\n')
    #pickle.dump(parents,output)
    #output.write('\n')
    return parents


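#A hedged sketch of how parents() is used to build definition-graph edges in
#section 4 below (illustrative helper, not called by the pipeline): every
#keyword at the present level is joined to each previous-level synset whose
#gloss mentions it.
def definition_graph_edges(keywords, prevlevelsynsets):
    edges = []
    for kw in keywords:
        for p in parents(kw, prevlevelsynsets):
            edges.append((kw, p))
    return edges
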
#function - best_matching_synset()
def best_matching_synset(doc_tokens, synsets):
    #output.write('best_matching_synset():\n')
    maxmatch = -1
    retset = []
    for synset in synsets:
        def_tokens = set(nltk.word_tokenize(synset.definition))
        intersection = def_tokens.intersection(doc_tokens)
        #output.write('--------------------')
        #output.write('intersection:\n')
        #pickle.dump(intersection, output)
        #output.write('\n')
        #output.write('--------------------')
        if len(intersection) > maxmatch:
            maxmatch = len(intersection)
            retset = synset
    #output.write(retset.definition)
    return retset

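#A hedged usage sketch of the Lesk-style disambiguation above (illustrative
#helper, not called by the pipeline): pick the WordNet sense of a word whose
#gloss shares the most tokens with the document.
def disambiguate(doc_tokens, word):
    from nltk.corpus import wordnet as wn
    return best_matching_synset(set(doc_tokens), wn.synsets(word))
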
#function - get_context()
def get_context(query, documents):
    file1 = open(documents[0])
    file_contents = file1.read()
    file_tokens = nltk.word_tokenize(file_contents)
    try:
        first_occur = file_tokens.index(query)
    except ValueError:
        return ""
    context = ''
    #clamp the window start at 0 so an early occurrence does not wrap around to the end of the token list
    for i in file_tokens[max(0, first_occur-5):first_occur+5]:
        context = context + ' ' + i
    return context

#function - get_jaccard_coefficient()
def get_jaccard_coefficient(refanswer, candidanswer):
    total = len(set(shingles(refanswer) + shingles(candidanswer)))
    intersect = len(set(shingles(refanswer)).intersection(set(shingles(candidanswer))))
    return (intersect + 1) / (total + 1)

#get shingles (bigram shingles of the tokenized phrase)
def shingles(phrase):
    return list(nltk.bigrams(nltk.word_tokenize(phrase)))

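#A hedged worked example of the Jaccard score above (illustrative values, not
#part of the pipeline): 'the quick brown fox' and 'the quick brown dog' share
#2 of their 4 distinct bigram shingles, so with the +1 smoothing the score is
#(2 + 1) / (4 + 1) = 0.6; disjoint answers score close to 0, identical answers close to 1.
#	get_jaccard_coefficient('the quick brown fox', 'the quick brown dog')
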
#----------------------------------------------
#1.Get the input documents
#----------------------------------------------

corpus = ['/media/disk/CMI-related/ThesisDemo-english-test1.txt',
    '/media/disk/CMI-related/ThesisDemo-english-test2.txt',
    '/media/disk/CMI-related/ThesisDemo-english-test3.txt',
    '/media/disk/CMI-related/ThesisDemo-carrace-test1.txt',
    '/media/disk/CMI-related/ThesisDemo-carrace-test2.txt',
    '/media/disk/CMI-related/ThesisDemo-carrace-test3.txt',
    '/media/disk/CMI-related/ThesisDemo-datamining-test1.txt',
    '/media/disk/CMI-related/ThesisDemo-datamining-test2.txt',
    '/media/disk/CMI-related/ThesisDemo-datamining-test3.txt',
    '/media/disk/CMI-related/ThesisDemo-datamining-test4.txt',
    '/media/disk/CMI-related/ThesisDemo-datamining-test5.txt',
    '/media/disk/CMI-related/ThesisDemo-graphtheory-test1.txt',
    '/media/disk/CMI-related/ThesisDemo-graphtheory-test2.txt',
    '/media/disk/CMI-related/ThesisDemo-graphtheory-test3.txt',
    '/media/disk/CMI-related/ThesisDemo-fermatslasttheorem-test16.txt',
    '/media/disk/CMI-related/ThesisDemo-fermatslasttheorem-test17.txt',
    '/media/disk/CMI-related/ThesisDemo-car-test1.txt',
    'ThesisDemo-newsmodipranab-test1.txt',
    'ThesisDemo-newsmodipranab-test2.txt']
#get keywords

import nltk
files = ['ThesisDemo-english-test1.txt',
    'ThesisDemo-english-test2.txt',
    'ThesisDemo-english-test3.txt',
    'ThesisDemo-carrace-test1.txt',
    'ThesisDemo-carrace-test2.txt',
    'ThesisDemo-carrace-test3.txt',
    'ThesisDemo-datamining-test1.txt',
    'ThesisDemo-datamining-test2.txt',
    'ThesisDemo-datamining-test3.txt',
    'ThesisDemo-datamining-test4.txt',
    'ThesisDemo-datamining-test5.txt',
    'ThesisDemo-graphtheory-test1.txt',
    'ThesisDemo-graphtheory-test2.txt',
    'ThesisDemo-graphtheory-test3.txt',
    'ThesisDemo-fermatslasttheorem-test16.txt',
    'ThesisDemo-fermatslasttheorem-test17.txt']


118 """
119 #---------------------------------------------------------------------------------
120 #2.Compute intrinsic merit (either using linear or quadratic overlap)
121 #---------------------------------------------------------------------------------
122 
123 for filestr in files:
124  filestrtok = filestr.split('-')
125  outputfile = 'Output-' + filestrtok[1] + '-' + filestrtok[2]
126  output = open(outputfile, 'w')
127  file1 = open(filestr)
128  raw1 = file1.read()
129  doc1 = nltk.word_tokenize(raw1)
130  from nltk.book import *
131  from nltk.corpus import stopwords
132  fdist1 = FreqDist(doc1)
133  stopwords = nltk.corpus.stopwords.words('english')
134  stopwords = stopwords + ['may', 'The', 'the', 'In', 'in','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
135  puncts = ['.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';','\'s']
136  #at present tfidf filter is not applied
137  freqterms1 = [w for w in fdist1.keys() if w not in stopwords and w not in puncts and (fdist1.freq(w) * compute_idf(corpus, w))]
138 
139  current_level = 1
140  nodewithmaxparents = ''
141  noofparents = 0
142  maxparents = 0
143  relatedness = 0
144  first_convergence_level = 1
145  tokensofthislevel = []
146  convergingterms = []
147  convergingparents = []
148  tokensofprevlevel = []
149  prevlevelsynsets = []
150  commontokens = []
151  vertices = 0
152  edges = 0
153  overlap = 0
154  iter = 0
155  from nltk.corpus import wordnet as wn
156 
157  #recurse down to required depth and update intrinsic merit score
158  #relatedness is either sum(overlaps) or sum((overlapping_parents)*(overlaps)^2) also called convergence factor
159  while current_level < 3:
160  #crucial - gather nodes which converge/overlap (have more than 1 parent)
161  if current_level > 1:
162  convergingterms = [w for w in freqterms1 if len(parents(w,prevlevelsynsets)) > 1]
163  for kw in freqterms1:
164  convergingparents = convergingparents + ([w for w in parents(kw, prevlevelsynsets) if len(parents(kw, prevlevelsynsets)) > 1])
165  for kw in freqterms1:
166  noofparents = len(parents(kw, prevlevelsynsets))
167  if noofparents > maxparents:
168  maxparents = noofparents
169  nodewithmaxparents = kw
170  output.write('converging terms(terms with more than 1 parent):\n ')
171  #pickle.dump(convergingterms,output)
172  output.write('\n')
173  output.write('converging parents :\n')
174  #pickle.dump(convergingparents,output)
175  output.write('\n')
176  for keyword in freqterms1:
177  #WSD - invokes Lesk's algorithm adapted to recursive gloss overlap- best_matching_synset()
178  output.write('===============================================\n')
179  output.write('keyword : ' + keyword)
180  output.write('\n')
181  #disamb_synset = best_matching_synset(set(doc1), wn.synsets(keyword))
182  disamb_synset = best_matching_synset(freqterms1, wn.synsets(keyword))
183  prevlevelsynsets = prevlevelsynsets + [disamb_synset]
184  output.write('prevlevelsynsets:\n')
185  #pickle.dump(prevlevelsynsets, output)
186  output.write('\n')
187  output.write('matching synset:\n')
188  #pickle.dump( disamb_synset,output)
189  output.write('\n')
190  if len(wn.synsets(keyword)) != 0:
191  disamb_synset_def = disamb_synset.definition
192  tokens = nltk.word_tokenize(disamb_synset_def)
193  fdist_tokens = FreqDist(tokens)
194  #at present frequency filter is not applied
195  #if keyword in convergingterms:
196  tokensofthislevel = tokensofthislevel + ([w for w in fdist_tokens.keys() if w not in stopwords and w not in puncts and fdist_tokens.freq(w)])
197  output.write('At level:\n')
198  output.write(str(current_level))
199  output.write('\n')
200  output.write('tokens grasped at this level:\n')
201  #pickle.dump(tokensofthislevel, output)
202  output.write('\n')
203  listcount = len(tokensofthislevel)
204  setcount = len(set(tokensofthislevel))
205  overlap = listcount-setcount
206  if overlap > 0 and iter == 0 :
207  first_convergence_level = current_level
208  iter = 1
209  #choose between two relatedness/convergence criteria :-
210  #1) simple linear overlap or 2) zipf distributed quadratic overlap
211  #relatedness = relatedness + len(convergingparents)*overlap
212  relatedness = relatedness + overlap + len(convergingparents)
213  #relatedness = relatedness + ((len(convergingparents)*overlap*overlap) + 1)
214  #find out common tokens of this and previous level so that same token does not get grasped again -
215  #relatedness must be increased since repetition of keywords in two successive levels is a sign of
216  #interrelatedness(a backedge from child-of-one-of-siblings to one-of-siblings). Remove vertices and edges #corresponding to common tokens
217  commontokens = set(tokensofthislevel).intersection(set(tokensofprevlevel))
218  tokensofthislevel = set(tokensofthislevel).difference(commontokens)
219  relatedness = relatedness + len(commontokens)
220  output.write('removing tokens already grasped:\n')
221  #pickle.dump(commontokens,output)
222  output.write('\n')
223  output.write('Relatedness:\n')
224  output.write(str(relatedness))
225  output.write('\n')
226  #decrease the vertices count to address common tokens removed above - edges should remain same since they
227  #would just point elsewhere
228  vertices = vertices + setcount - len(commontokens)
229  output.write('Vertices:\n')
230  output.write(str(vertices))
231  output.write('\n')
232  edges = edges + listcount
233  output.write('Edges:\n')
234  output.write(str(edges))
235  output.write('\n')
236  current_level = current_level + 1
237  freqterms1 = set(tokensofthislevel)
238  tokensofprevlevel = tokensofthislevel
239  tokensofthislevel = []
240 
241  intrinsic_merit = vertices*edges*relatedness / first_convergence_level
242  output.write('Intrinsic merit of this document is:\n')
243  output.write(str(intrinsic_merit))
244  output.write('\n')
245  output.write('Node with maximum parents (and hence the most likely class of document) is:\n')
246  output.write(nodewithmaxparents)
247  output.write('\n')
248 """
#--------------------------------
#3. Interview Algorithm
#--------------------------------
252 """
253 references = ['ThesisDemo-carrace-test1.txt']
254 candidate = ['ThesisDemo-carrace-test2.txt']
255 queries = []
256 stopwords = nltk.corpus.stopwords.words('english')
257 stopwords = stopwords + ['may', 'The', 'the', 'In', 'in','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
258 puncts = ['.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';','\'s']
259 from nltk.book import *
260 for filestr in references:
261  file1 = open(filestr)
262  filecontents = file1.read()
263  filetokens = nltk.word_tokenize(filecontents)
264  freqdist = FreqDist(filetokens)
265  queries = queries + [w for w in freqdist.keys() if w not in stopwords and w not in puncts and freqdist.freq(w) * compute_idf(corpus, w) > 0.1]
266 
267 print 'Most important tokens/ngrams = queries :'
268 print queries
269 
270 ref_qandamap={}
271 candid_qandamap={}
272 for q in queries:
273  ref_qandamap[q] = get_context(q, references)
274 
275 candid_file = open(candidate[0])
276 candid_file_contents = candid_file.read()
277 candid_tokens = nltk.word_tokenize(candid_file_contents)
278 candid_text = nltk.Text(candid_tokens)
279 for q in queries:
280  #print '--------------------------------'
281  #print 'concordance for :' + q
282  #candid_text.concordance(q)
283  #print '--------------------------------'
284  candid_qandamap[q] = get_context(q, candidate)
285 
286 print 'reference:'
287 print ref_qandamap
288 print 'candidate:'
289 print candid_qandamap
290 
291 #compute jaccard coefficient score for each question and answer
292 scores=[]
293 i=0
294 for q in queries:
295  print 'q = ' + q
296  scores.append(get_jaccard_coefficient(ref_qandamap[q], candid_qandamap[q]))
297 
298 x=0
299 sumscore = 0
300 for x in scores:
301  sumscore = sumscore + x
302 
303 print 'Interview score:'
304 print sumscore
305 """
#--------------------------------------------------------------------------------------------------------------------
#4.Compute Value Addition applying edit distance between definition-graph(reference) and definition-graph(candidate)
#at present limited to 1 reference which will be updated after interview algorithm finishes if necessary
#--------------------------------------------------------------------------------------------------------------------
references = ['ThesisDemo-newsmodipranab-test1.txt']
candidate = ['ThesisDemo-newsmodipranab-test2.txt']

ref_file = open(references[0])
candid_file = open(candidate[0])
ref_file_contents = ref_file.read()
candid_file_contents = candid_file.read()
ref_tokens = nltk.word_tokenize(ref_file_contents)
candid_tokens = nltk.word_tokenize(candid_file_contents)
from nltk.book import *
from nltk.corpus import stopwords
reffdist = FreqDist(ref_tokens)
candidfdist = FreqDist(candid_tokens)
stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords + ['may', 'The', 'the', 'In', 'in','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
puncts = ['.', '"', ',', '{', '}', '+', '-', '*', '/', '%', '&', '(', ')', '[', ']', '=', '@', '#', ':', '|', ';','\'s']
#at present tfidf filter is not applied
reffreqterms = [w for w in reffdist.keys() if w not in stopwords and w not in puncts and (reffdist.freq(w) * compute_idf(corpus, w))]
candfreqterms = [w for w in candidfdist.keys() if w not in stopwords and w not in puncts and (candidfdist.freq(w) * compute_idf(corpus, w))]

refkeyword = ''
candkeyword = ''
current_level = 1
editdistance = 0
reftokensofthislevel = []
candtokensofthislevel = []
reftokensofprevlevel = []
candtokensofprevlevel = []
refcommontokens = []
candcommontokens = []
refprevlevelsynsets = []
candprevlevelsynsets = []
ref_definitiongraphedges = []
cand_definitiongraphedges = []
from nltk.corpus import wordnet as wn

#recurse down to required depth and update edit distance between reference and candidate documents
while current_level < 3:
    if current_level > 1:
        for kw in reffreqterms:
            refparents = parents(kw, refprevlevelsynsets)
            for p in refparents:
                ref_definitiongraphedges.append((kw, p))
        for kw in candfreqterms:
            candparents = parents(kw, candprevlevelsynsets)
            for p in candparents:
                cand_definitiongraphedges.append((kw, p))
        print 'Current level:'
        print current_level
        print 'Reference DefGraph:'
        print ref_definitiongraphedges
        print 'Candidate DefGraph:'
        print cand_definitiongraphedges
    #for refkeyword, candkeyword in reffreqterms, candfreqterms:
    reffreqtermscount = 0
    candfreqtermscount = 0
    while reffreqtermscount < len(reffreqterms) or candfreqtermscount < len(candfreqterms):
        #WSD - invokes Lesk's algorithm adapted to recursive gloss overlap - best_matching_synset()
        if reffreqtermscount < len(reffreqterms):
            refdisamb_synset = best_matching_synset(reffreqterms, wn.synsets(list(reffreqterms)[reffreqtermscount]))
            print refdisamb_synset
            refprevlevelsynsets = refprevlevelsynsets + [refdisamb_synset]
        if candfreqtermscount < len(candfreqterms):
            canddisamb_synset = best_matching_synset(candfreqterms, wn.synsets(list(candfreqterms)[candfreqtermscount]))
            print canddisamb_synset
            candprevlevelsynsets = candprevlevelsynsets + [canddisamb_synset]
        if reffreqtermscount < len(reffreqterms) and len(wn.synsets(list(reffreqterms)[reffreqtermscount])) != 0:
            refdisamb_synset_def = refdisamb_synset.definition
            reftokens = nltk.word_tokenize(refdisamb_synset_def)
            reffdist_tokens = FreqDist(reftokens)
            #at present frequency filter is not applied
            #if keyword in convergingterms:
            reftokensofthislevel = reftokensofthislevel + ([w for w in reffdist_tokens.keys() if w not in stopwords and w not in puncts and reffdist_tokens.freq(w)])
        if candfreqtermscount < len(candfreqterms) and len(wn.synsets(list(candfreqterms)[candfreqtermscount])) != 0:
            canddisamb_synset_def = canddisamb_synset.definition
            candtokens = nltk.word_tokenize(canddisamb_synset_def)
            canddist_tokens = FreqDist(candtokens)
            #at present frequency filter is not applied
            #if keyword in convergingterms:
            candtokensofthislevel = candtokensofthislevel + ([w for w in canddist_tokens.keys() if w not in stopwords and w not in puncts and canddist_tokens.freq(w)])
        reffreqtermscount = reffreqtermscount + 1
        candfreqtermscount = candfreqtermscount + 1
    reflistcount = len(reftokensofthislevel)
    candlistcount = len(candtokensofthislevel)
    refsetcount = len(set(reftokensofthislevel))
    candsetcount = len(set(candtokensofthislevel))
    #find out common tokens of this and previous level so that same token does not get grasped again -
    #relatedness must be increased since repetition of keywords in two successive levels is a sign of
    #interrelatedness (a backedge from child-of-one-of-siblings to one-of-siblings). Remove vertices and edges
    #corresponding to common tokens
    refcommontokens = set(reftokensofthislevel).intersection(set(reftokensofprevlevel))
    candcommontokens = set(candtokensofthislevel).intersection(set(candtokensofprevlevel))
    reftokensofthislevel = set(reftokensofthislevel).difference(refcommontokens)
    candtokensofthislevel = set(candtokensofthislevel).difference(candcommontokens)
    current_level = current_level + 1
    reffreqterms = set(reftokensofthislevel)
    candfreqterms = set(candtokensofthislevel)
    reftokensofprevlevel = reftokensofthislevel
    candtokensofprevlevel = candtokensofthislevel
    reftokensofthislevel = []
    candtokensofthislevel = []
for kw in reffreqterms:
    refparents = parents(kw, refprevlevelsynsets)
    for p in refparents:
        ref_definitiongraphedges.append((kw, p))
for kw in candfreqterms:
    candparents = parents(kw, candprevlevelsynsets)
    for p in candparents:
        cand_definitiongraphedges.append((kw, p))
print 'Current level:'
print current_level
print 'Reference DefGraph:'
print ref_definitiongraphedges
print 'Candidate DefGraph:'
print cand_definitiongraphedges

#value addition - edges present in candidate but not in reference
valueaddition = set(cand_definitiongraphedges).difference(set(ref_definitiongraphedges))
#edit distance - edges added + edges removed
editdistance = editdistance + len(set(cand_definitiongraphedges).difference(set(ref_definitiongraphedges))) + len(set(ref_definitiongraphedges).difference(set(cand_definitiongraphedges)))
print 'Value addition:'
print valueaddition
print 'Edit Distance:'
print editdistance

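#A minimal, hedged sketch of the two measures computed above (illustrative
#helpers, not called by the pipeline): value addition is the set of edges
#present in the candidate graph but not the reference graph, and the edit
#distance is the size of the symmetric difference of the two edge sets
#(edges added plus edges removed).
def value_addition(ref_edges, cand_edges):
    return set(cand_edges).difference(set(ref_edges))

def definition_graph_edit_distance(ref_edges, cand_edges):
    return len(set(cand_edges).symmetric_difference(set(ref_edges)))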