#!/usr/bin/env python # coding: utf-8 # # Panama Palindrome # # ## Utilities # In[39]: import random, re, bisect, time def is_palindrome(s): "Test if a string is a palindrome (only considering letters a-z)." s1 = canonical(s) return s1 == reversestr(s1) def is_unique_palindrome(s): "Test if string s is a palindrome where each comma-separated phrase is unique." return is_palindrome(s) and is_unique(phrases(s)) def canonical(word, sub=re.compile('[^a-z]').sub): "The canonical form for comparing: only lowercase a-z." return sub('', word.lower()) def phrases(s): "Break a string s into comma-separated phrases." return [phrase.strip() for phrase in s.split(',')] def reversestr(s): "Reverse a string." return s[::-1] def is_unique(collection): "Return true if collection has no duplicate elements." return len(collection) == len(set(collection)) # In[35]: def test_utils(): assert is_unique_palindrome('A man, a plan, a canal, Panama!') assert is_unique_palindrome('''A (man), a PLAN... a ``canal?'' -- Panama!''') assert not is_unique_palindrome('A man, a plan, a radar, a canal, Panama.') assert is_palindrome('A man, a plan, a canal, Panama.') assert is_palindrome('Radar. Radar? Radar!') assert not is_palindrome('radars') assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama'] assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama' assert reversestr('foo') == 'oof' assert is_unique([1, 2, 3]) assert not is_unique([1, 2, 2]) return 'test_utils passes' test_utils() # ## The Dictionary # In[6]: get_ipython().system(' [ -e npdict.txt ] || curl -O http://norvig.com/npdict.txt') # In[7]: get_ipython().system(' wc npdict.txt') # In[45]: ################ Reading in a dictionary class PalDict: """A dictionary with the following fields: words: a sorted list of words: ['ant', 'bee', 'sea'] rwords: a sorted list of reversed words: ['aes', 'eeb', 'tna'] truename: a dict of {canonical:true} pairs, e.g. {'anelk': 'an elk', 'anneelk': 'Anne Elk'} k: and the followng methods: """ def __init__(self, k=100, filename='npdict.txt'): words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'} for tword in open(filename, 'r', encoding='ascii', errors='ignore').read().splitlines(): word = canonical(tword) words.append(word) rwords.append(reversestr(word)) truename[word] = tword words.sort() rwords.sort() self.k = k self.words = words self.rwords = rwords self.truename = truename self.rangek = range(k) self.tryharder = False def startswith(self, prefix): """Return up to k canonical words that start with prefix. If there are more than k, choose from them at random.""" return self._k_startingwith(self.words, prefix) def endswith(self, rsuffix): """Return up to k canonical words that end with the reversed suffix. If you want words ending in 'ing', ask for d.endswith('gni'). If there are more than k, choose from them at random.""" return [reversestr(s) for s in self._k_startingwith(self.rwords, rsuffix)] def __contains__(self, word): return word in self.truename def _k_startingwith(self, words, prefix): start = bisect.bisect_left(words, prefix) end = bisect.bisect(words, prefix + 'zzzz') n = end - start if self.k >= n: # get all the words that start with prefix results = words[start:end] else: # sample from words starting with prefix indexes = random.sample(range(start, end), self.k) results = [words[i] for i in indexes] random.shuffle(results) ## Consider words that are prefixes of the prefix. ## This is very slow, so don't use it until late in the game. if self.tryharder: for i in range(3, len(prefix)): w = prefix[0:i] if ((words == self.words and w in self.truename) or (words == self.rwords and reversestr(w) in self.truename)): results.append(w) return results paldict = PalDict() def anpdictshort(): "Find the words that are valid when every phrase must start with 'a'" def segment(word): return [s for s in word.split('a') if s] def valid(word): return all(reversestr(s) in segments for s in segment(word)) words = [canonical(w) for w in open('anpdict.txt')] segments = set(s for w in words for s in segment(w)) valid_words = [paldict.truename[w] for w in words if valid(w)] file('anpdict-short2.txt', 'w').write('\n'.join(valid_words)) ################ Search for a palindrome class Panama: def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict): ## .left and .right hold lists of canonical words ## .diff holds the number of characters that are not matched, ## positive for words on left, negative for right. ## .stack holds (action, side, arg) tuples self.left = [] self.right = [] self.best = 0 self.seen = {} self.diff = 0 self.stack = [] self.starttime = time.clock() self.dict = dict self.steps = 0 for word in L.split(','): self.add('left', canonical(word)) for rword in reversestr(R).split(','): self.add('right', canonical(reversestr(rword))) self.consider_candidates() def search(self, steps=10*1000*1000): "Search for palindromes." for self.steps in range(steps): if not self.stack: return 'done' action, dir, substr, arg = self.stack[-1] if action == 'added': # undo the last word added self.remove(dir, arg) elif action == 'trying' and arg: # try the next word if there is one self.add(dir, arg.pop()) and self.consider_candidates() elif action == 'trying' and not arg: # otherwise backtrack self.stack.pop() else: raise ValueError(action) self.report() return self def add(self, dir, word): "add a word" if word in self.seen: return False else: getattr(self, dir).append(word) self.diff += factor[dir] * len(word) self.seen[word] = True self.stack.append(('added', dir, '?', word)) return True def remove(self, dir, word): "remove a word" oldword = getattr(self, dir).pop() assert word == oldword self.diff -= factor[dir] * len(word) del self.seen[word] self.stack.pop() def consider_candidates(self): """Push a new state with a set of candidate words onto stack.""" if self.diff > 0: # Left is longer, consider adding on right dir = 'right' substr = self.left[-1][-self.diff:] candidates = self.dict.endswith(substr) elif self.diff < 0: # Right is longer, consider adding on left dir = 'left' substr = reversestr(self.right[-1][0:-self.diff]) candidates = self.dict.startswith(substr) else: # Both sides are same size dir = 'left' substr = '' candidates = self.dict.startswith('') if substr == reversestr(substr): self.report() self.stack.append(('trying', dir, substr, candidates)) def report(self): "Report a new palindrome to log file (if it is sufficiently big)." N = len(self) if N > 13333: self.dict.tryharder = True if N > self.best and (N > 13000 or N > self.best+1000): self.best = len(self) self.bestphrase = str(self) print('%5d phrases (%5d words) in %3d seconds (%6d steps)' % ( self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime, self.steps)) assert is_unique_palindrome(self.bestphrase) def __len__(self): return len(self.left) + len(self.right) def __str__(self): truename = self.dict.truename lefts = [truename[w] for w in self.left] rights = [truename[w] for w in self.right] return ', '.join(lefts + rights[::-1]) def __repr__(self): return ''.format(len(self)) factor = {'left': +1, 'right': -1} # Note that we only allow one truename per canonical name. Occasionally # this means we miss a good word (as in "a node" vs. "an ode"), but there # are only 665 of these truename collisions, and most of them are of the # form "a mark-up" vs. "a markup" so it seemed better to disallow them. # In[30]: len(paldict.words) # In[46]: ################ Unit Tests def test2(p=PalDict()): d = p.dict def sameset(a, b): return set(a) == set(b) assert 'panama' in d assert d.words[0] in d assert d.words[-1] in d assert sameset(d.startswith('aword'), ['awording', 'awordbreak', 'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook', 'awordlessness', 'aword', 'awordsmith']) assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity', 'asubglobosity', 'anonverbosity', 'agibbosity']) d.tryharder = True assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla']) d.tryharder = False assert d.startswith('oklahoma') == ['oklahoma'] assert d.startswith('fsfdsfdsfds') == [] print('all tests pass') return p p = Panama() test2(p) p.search().report() p # In[ ]: # In[21]: get_ipython().system(' [ -e anpdict.txt ] || curl -O http://norvig.com/anpdict.txt') # In[26]: anpdictshort() # In[27]: get_ipython().system(' wc *npd*') # # Letter-By-Letter Approach # # Can we go letter-by-letter instead of word-by-word? Advantages: # # * We can (if need be) be exhaustive at each decision point, trying all 26 possibilities. # * We can try the most likely letters first. # # Process # # * Keep left- nad right- partial phrase lists; and the current state: # # {left: ['aman', 'aplan'], right: ['acanal', panama'], # left_word: True, right_word: True, extra_chars: +3, palindrome: True} # # * Now consider all ways of extending: # # - Add the letter `'a'` to the left, either as a new word or a continuation of the old word (perhaps going for `'a planaria'`). # - Add a letter, any letter, to the right, either as a new word or a continuation of # # # In[ ]: from collections import namedtuple def do(state, action, side, L): action(state, side, L) def add(state, side, L): getattr(state, side)[-1] += L def new(state, side, L): getattr(state, side).append(L) def undo(action, letter): if action == add: elif action == new: else: raise ValueError()