document.write('');
document.write('');
document.write('');
document.write('');
document.write('Paste provided by Paste.ee - View Original - View Raw - Download
');
document.write('
from typing import Tuple\nimport random\nimport math\nimport os\nimport sys\nimport operator\n\ndef lower_bound(a, x, lo=0, hi=None):\n """Return the index where to insert item x in list a, assuming a is sorted.\n\n The return value i is such that all e in a[:i] have e < x, and all e in\n a[i:] have e >= x. So if x already appears in the list, a.insert(x) will\n insert just before the leftmost x already there.\n\n Optional args lo (default 0) and hi (default len(a)) bound the\n slice of a to be searched.\n """\n\n L=len(x)\n\n if lo < 0:\n raise ValueError(\'lo must be non-negative\')\n if hi is None:\n hi = len(a)\n while lo < hi:\n mid = (lo+hi)//2\n if a[mid][:L] < x: lo = mid+1\n else: hi = mid\n return lo\n\ndef upper_bound(a, x, lo=0, hi=None):\n """Return the index where to insert item x in list a, assuming a is sorted.\n\n The return value i is such that all e in a[:i] have e <= x, and all e in\n a[i:] have e > x. So if x already appears in the list, a.insert(x) will\n insert just after the rightmost x already there.\n\n Optional args lo (default 0) and hi (default len(a)) bound the\n slice of a to be searched.\n """\n L=len(x)\n\n if lo < 0:\n raise ValueError(\'lo must be non-negative\')\n if hi is None:\n hi = len(a)\n while lo < hi:\n mid = (lo+hi)//2\n if x < a[mid][:L]: hi = mid\n else: lo = mid+1\n return lo\n\nclass Tree(object):\n """\n Our tree implementation.\n """\n\n def __init__(self, scanWindowSz):\n self.scanWindowSz = scanWindowSz\n self.lenRecords = 0 # number of records\n self.records = [] # all hypo records made from entailData for fast search of scanWindow or verification\n self.candidates = []\n\n def build(self, text:list):\n print("Building Tree...")\n\n def make_record(word_list):\n res = ""\n for word in word_list:\n res += word.lower() + " "\n return res.strip()\n\n length = len(text)\n self.records = []\n\n for i in range(0, length):\n windowSize= min(self.scanWindowSz, length - i)\n shouldInsert = True\n for word in text[i:i+windowSize]:\n if not(word.isalpha() or (word[:-1].isalpha() and (word[-1]==\',\' or word[-1]==\'.\'))):\n shouldInsert = False\n break\n if shouldInsert == True:\n self.records.append(make_record(text[i:i+windowSize]))\n\n self.records.sort()\n self.lenRecords = len(self.records)\n\n # print(self.records)\n # print("Building Tree done")\n\n def find_interval(self, searchStr, lo=0, hi=None):\n if (hi == None):\n hi = self.lenRecords\n fro = lower_bound(self.records, searchStr, lo, hi)\n to = upper_bound(self.records, searchStr, lo, hi)\n return fro, to\n\n def mergeCandidates(self, candidates, idx):\n # print("before", self.candidates)\n self.candidates = []\n if (len(candidates) == 0):\n return\n\n def isSameCluster(a, b, verifyLen):\n a = a.split()\n b = b.split()\n if len(a) < verifyLen + 1 or len(b) < verifyLen + 1:\n return False\n \n for i in range (0, verifyLen+1):\n if a[i] != b[i]:\n return False\n return True\n\n\n candidates.sort(key = operator.itemgetter(0, 1))\n self.candidates.append(candidates[0])\n for candy in candidates:\n if candy[0] > self.candidates[-1][1]:\n self.candidates.append(candy)\n else:\n if isSameCluster(self.records[self.candidates[-1][0]], self.records[candy[0]], idx):\n self.candidates[-1] = (self.candidates[-1][0], max(self.candidates[-1][1], candy[1]))\n else:\n self.candidates.append(candy)\n\n\n def update_words(self, idx:int, word_list:list):\n # print(idx, word_list)\n if idx == 0:\n candidates = []\n for searchStr in word_list:\n fro, to = self.find_interval(searchStr + " ")\n if fro < to:\n candidates.append((fro, to))\n else:\n candidates = []\n for (fro, to) in self.candidates:\n for word in word_list:\n searchStr = \' \'.join(self.records[fro].split()[:idx]) + " " + word + " "\n froNew, toNew = self.find_interval(searchStr, fro, to)\n if froNew < toNew:\n candidates.append((froNew, toNew))\n\n # print(candidates)\n self.mergeCandidates(candidates, idx)\n # print(self.candidates)\n\n def dist(self, a, b):\n L = min(len(a), len(b))\n nSame = 0\n pattern = 0\n for i in range(0, L):\n if a[i] == b[i]:\n nSame = nSame + 1\n pattern = pattern + (1<<i)\n \n return (nSame, pattern)\n\n def get_words(self, idx:int):\n """\n return word list of vanila serarch ex. [[down the street], [up the street], [down the stair], ...]\n here predicted next word is [street, stair, ...]\n """\n\n resultSet = []\n for (fro, to) in self.candidates:\n for sequence in self.records[fro:to]:\n word_list = sequence.split()\n if (len(word_list) > idx):\n resultSet.append(\' \'.join(word_list[:idx+1]))\n\n result = []\n for hp in resultSet:\n words = hp.split()\n if words[-1][-1]==\',\' or words[-1][-1]==\'.\':\n words[-1] = words[-1][:-1]\n if words[-1] == "":\n continue\n result.append(words)\n\n return result\n\n def is_contain(self, searchStr:str) -> bool:\n """\n check if a tree contain searchStr\n """\n fro, to = self.find_interval(searchStr)\n if fro < to:\n return True\n else:\n return False\n\n def calc_frequency(self, searchStr:str) -> int:\n """\n calculate the number of occurances of search str in entailData\n """\n fro, to = self.find_interval(searchStr)\n if fro < to:\n return to-fro\n else:\n return 0\n\n def calc_most_frequent_next_word(self, searchStr:str):\n """\n calcuate the most frequent next word of a SearchStr ex. "down the " -> "path, 15"\n """\n L = len(searchStr.strip().split())\n fro, to = self.find_interval(searchStr)\n\n cur_word = ""\n cur_cnt = 0\n max_cnt = 0\n max_word = ""\n for sequence in self.records[fro:to]:\n word_list = sequence.split()\n if (len(word_list) <= L):\n continue\n\n word = word_list[L]\n if word == cur_word:\n cur_cnt += 1\n else:\n if cur_cnt > max_cnt:\n max_cnt = cur_cnt\n max_word = cur_word\n\n cur_word = word\n cur_cnt = 1\n \n if cur_cnt > max_cnt:\n max_cnt = cur_cnt\n max_word = cur_word\n\n return (max_word, max_cnt)\n\n\nif __name__ == "__main__":\n # tree = Tree(3)\n # tree.build(\'hello wor*ld how are you how are How do yo-u do \')\n\n # print(tree.is_contain("hello wOr*ld how ")) # True\n # print(tree.is_contain("how do yo-u do ")) # True\n # print(tree.is_contain("wor*LD how a")) # True\n \n # print(tree.find_nextword("how do yo-u ")) # do\n # print(tree.find_nextword("how do your ")) # None\n # print(tree.find_nextword("how are ")) # you or how\n # print(tree.find_nextword("how are ")) # you or how\n\n import smart_open\n import pickle\n\n entailData = sys.argv[1] # entailData to make tree\n\n tree = Tree(4, 3, "SplitedWindows")\n words = []\n with smart_open.smart_open(entailData, \'r\') as entail:\n for line in entail:\n words.extend(line.strip().split())\n\n tree.build(words)\n print(tree.records)\n\n print("Saving Tree to \"Tree\"")\n with open("Tree", \'wb\') as outfile:\n pickle.dump(tree, outfile)\n\n print("Saving Tree done")\n\n tree.update_words(0, ["today", "Are", "Earth", "hit", "a"])\n tree.update_words(1, ["meteor,", "Earth,"])\n print(tree.get_words(2))\n\n print(tree.is_contain("meteor, "))\n print(tree.is_contain("a "))\n print(tree.is_contain("a, "))\n\n print(tree.calc_frequency("meteor, "))\n print(tree.calc_frequency("a "))\n print(tree.calc_frequency("a, ")) \n print("yes yes yes yes", tree.calc_frequency("yes yes yes yes"))
');
function initEmbeddedPaste_eJ2CrdDDoyYkFh4q() {
hljs.highlightBlock(document.getElementById('pastee-eJ2CrdDDoyYkFh4q-content'));
}
addEventListener('DOMContentLoaded', initEmbeddedPaste_eJ2CrdDDoyYkFh4q, false);
addEventListener('load', initEmbeddedPaste_eJ2CrdDDoyYkFh4q, false);