#!/usr/bin/env python # coding: utf-8 #
#
# *The goal is to identify as many words that meet the following criteria:*
#
# (1) *The word must be at least four letters long.*
#
# (2) *The word must include the central letter.*
#
# (3) *The word cannot include any letter beyond the seven given letters.*
#
# *Note that letters can be repeated. For example, the words GAME and AMALGAM are both acceptable words. Four-letter words are worth 1 point each, while five-letter words are worth 5 points, six-letter words are worth 6 points, seven-letter words are worth 7 points, etc. Words that use all of the seven letters in the honeycomb are known as “pangrams” and earn 7 bonus points (in addition to the points for the length of the word). So in the above example, MEGAPLEX is worth 15 points.*
#
# ***Which seven-letter honeycomb results in the highest possible game score?*** *To be a valid choice of seven letters, no letter can be repeated, it must not contain the letter S (that would be too easy) and there must be at least one pangram.*
#
# *For consistency, please use [this word list](https://norvig.com/ngrams/enable1.txt) to check your game score.*
#
# # My Approach
#
# Since the referenced word list came from **my** web site (I didn't make up the list; it is a standard Scrabble word list that I happen to host a copy of), I felt somewhat compelled to solve this one.
#
# Other word puzzles are hard because there are so many possibilities to consider.
# But fortunately the honeycomb puzzle (unlike [Boggle](https://github.com/aimacode/aima-python/blob/master/search.py) or [Scrabble](Scrabble.ipynb)) deals with *unordered sets* of letters, not *ordered permutations* of letters. So, once we exclude the "S", there are only (25 choose 7) = 480,700 *sets* of seven letters to consider. A brute force approach could evaluate all of them (probably over the course of multiple hours).
#
# Fortunately, I noticed a better trick. The rules say that every valid honeycomb must contain a pangram. Therefore, it must be the case that every valid honeycomb **is** a pangram. How many pangrams could there be in the word list—maybe 10,000? It must be a lot less than the number of sets of 7 letters.
#
# So here's a broad sketch of my approach:
#
# - The **best honeycomb** is the one with the highest game score among all candidate honeycombs.
# - A **candidate honeycomb** is any set of 7 letters that constitute a pangram word in the word list, with any one of the 7 letters as the center.
# - A **pangram word** is a word with exactly 7 distinct letters.
# - The **game score** for a honeycomb is the sum of the word scores for all the words that the honeycomb can make.
# - The **word score** of a word is 1 for four-letter words, or else $n$ for $n$-letters plus a 7-point bonus for pangrams.
# - A honeycomb **can make** a word if all the letters in the word are in the honeycomb, and the word contains the center letter.
# - The **set of letters** in a word (or honeycomb) can be represented as a sorted string of distinct letters (e.g., the set of letters in "AMALGAM" is "AGLM").
# - A **honeycomb** is defined by two things, the set of seven letters, and the distinguished single center letter.
# - The **word list** can ignore words that: are less than 4 letters long; have an S; or have more than 7 distinct letters.
#
# (Note: I could have used a `frozenset` to represent a set of letters, but a sorted string seemed simpler, and for debugging purposes, I'd rather be looking at `'AEGLMPX'` than at `frozenset({'A', 'E', 'G', 'L', 'M', 'P', 'X'})`).
#
# Each of these concepts can be implemented in a couple lines of code:
# In[1]:
def best_honeycomb(words) -> tuple:
"""Return (score, honeycomb) for the honeycomb with highest game score on these words."""
return max((game_score(h, words), h) for h in candidate_honeycombs(words))
def candidate_honeycombs(words):
"""The pangram lettersets, each with all 7 centers."""
pangrams = {letterset(w) for w in words if is_pangram(w)}
return (Honeycomb(pangram, center) for pangram in pangrams for center in pangram)
def is_pangram(word) -> bool:
"""Does a word have exactly 7 distinct letters?"""
return len(set(word)) == 7
def game_score(honeycomb, words) -> int:
"""The total score for this honeycomb; the sum of the word scores."""
return sum(word_score(word) for word in words if can_make(honeycomb, word))
def word_score(word) -> int:
"""The points for this word, including bonus for pangram."""
bonus = (7 if is_pangram(word) else 0)
return (1 if len(word) == 4 else len(word) + bonus)
def can_make(honeycomb, word) -> bool:
"""Can the honeycomb make this word?"""
(letters, center) = honeycomb
return center in word and all(L in letters for L in word)
def letterset(word) -> str:
"""The set of letters in a word, as a sorted string.
For example, letterset('GLAM') == letterset('AMALGAM') == 'AGLM'."""
return ''.join(sorted(set(word)))
def Honeycomb(letters, center) -> tuple: return (letters, center)
def wordlist(text) -> list:
"""A list of all the valid whitespace-separated words in text."""
return [w for w in text.upper().split()
if len(w) >= 4 and 'S' not in w and len(set(w)) <= 7]
# # Experimentation and Small Test
#
# I'll make a tiny word list and start experimenting with it:
# In[2]:
words = wordlist('amalgam amalgamation game games gem glam megaplex cacciatore erotica I me')
words
# Note that `I`, `me` and `gem` are too short, `games` has an `S` which is not allowed, and `amalgamation` has too many distinct letters (8). We're left with six valid words out of the original eleven. Here are examples of the functions in action:
# In[3]:
{w: word_score(w) for w in words}
# In[4]:
{w for w in words if is_pangram(w)}
# In[5]:
{w: letterset(w) for w in words}
# Note that AMALGAM and GLAM have the same letterset, as do CACCIATORE and EROTICA.
# In[6]:
honeycomb = Honeycomb('AEGLMPX', 'G')
# In[7]:
{w: word_score(w) for w in words if can_make(honeycomb, w)}
# In[8]:
game_score(honeycomb, words)
# In[9]:
best_honeycomb(words)
# **We're done!** We know how to find the best honeycomb. But so far, we've only done it for the tiny word list. Let's look at the real word list.
#
# # The enable1 Word List
#
# In[10]:
get_ipython().system(' [ -e enable1.txt ] || curl -O http://norvig.com/ngrams/enable1.txt')
get_ipython().system(' wc -w enable1.txt')
# In[11]:
enable1 = wordlist(open('enable1.txt').read())
len(enable1)
# In[12]:
pangrams = [w for w in enable1 if is_pangram(w)]
len(pangrams)
# In[13]:
pangram_sets = {letterset(w) for w in pangrams}
len(pangram_sets)
# In[14]:
_ * 7
# So to recap on the number of words of various types in enable1:
#
# 172,820 total words
# 44,585 valid words (eliminating "S" words, short words, 8+ letter words)
# 14,741 pangram words
# 7,986 unique pangram lettersets
# 55,902 candidate honeycombs
#
# How long will it take to run `best_honeycomb(enable1)`? Let's estimate by checking how long it takes to compute the game score of a single honeycomb:
# In[15]:
get_ipython().run_line_magic('time', 'game_score(honeycomb, enable1)')
# That's to compute one `game_score`. Multiply by 55,902 candidate honeycombs and we get somewhere in the 10 minute range. I could run `best_honeycomb(enable1)` right now and take a coffee break until it completes, but I'm predisposed to think that a puzzle like this deserves a more elegant solution. I know that [Project Euler](https://projecteuler.net/) designs their puzzles so that a good solution runs in less than a minute, so I'll make that my goal here.
#
# # Making it Faster
#
# Here's how I think about making a more efficient program:
#
# - We're doing a `game_score` for each of the 55,902 `candidate_honeycombs`.
# - `game_score` has to **look at each word in the wordlist, and test if it is a subset of the honeycomb.**
# - We can speed things up by flipping the test around: **look at each letter subset of the honeycomb, and test if it is in the word list.**
# - By **letter subset** I mean a letter set containing a subset of the letters in the honeycomb, and definitely containing the center. So, for `Honeycomb('ACEIORT', 'T')` the letter subsets are `['T', 'AT', 'CT', 'ET', 'IT', 'OT', 'RT', 'ACT', 'AET', ...]`
# - Why will flipping the test be faster? Because there are 44,585 words in the word list and only 64 letter subsets of a honeycomb. (A subset must include the center letter, and it may or may not include each of the other 6 letters, so there are exactly $2^6 = 64$ letter subsets of each pangram.)
# - We're left with the problem of deciding if a letter subset is a word. In fact, a letter subset might correspond to multiple words (e.g. `'AGLM'` corresponds to both `GLAM` and `AMALGAM`).
# - Ultimately we're more interested in the total number of points that a letter subset corresponds to, not in the individual word(s).
# - So I will create a table of `{letter_subset: total_points}` giving the total number of word score points for all the words that correspond to the letter subset. I call this a `points_table`.
# - Since the points table is independent of any honeycomb, I can compute it once and for all; I don't need to recompute it for each honeycomb.
# - To compute `game_score`, just take the sum of the 64 letter subset entries in the points table.
#
# Here's the code. Notice I didn't want to redefine the global function `game_score` with a different signature, so instead I made it be a local function that references the local `pts_table`,
# In[16]:
from collections import Counter, defaultdict
from itertools import combinations
def best_honeycomb(words) -> tuple:
"""Return (score, honeycomb) for the honeycomb with highest score on these words."""
pts_table = points_table(words)
def game_score(honeycomb) -> int:
return sum(pts_table[s] for s in letter_subsets(honeycomb))
return max((game_score(h), h) for h in candidate_honeycombs(words))
def points_table(words) -> dict:
"""Return a dict of {letterset: points} from words."""
table = Counter()
for w in words:
table[letterset(w)] += word_score(w)
return table
def letter_subsets(honeycomb) -> list:
"""The 64 subsets of the letters in the honeycomb (that must contain the center letter)."""
(letters, center) = honeycomb
return [''.join(subset)
for n in range(1, 8)
for subset in combinations(letters, n)
if center in subset]
# Let's get a feel for how this works. First the `letter_subsets`:
# In[17]:
# A 4-letter honeycomb makes 2**3 = 8 subsets; 7-letter honeycombs make 2**7 == 64
letter_subsets(('ABCD', 'C'))
# Now the `points_table`:
# In[18]:
words # Remind me again what the words are?
# In[19]:
points_table(words)
# The letterset `'ACEIORT'` gets 31 points, 17 for CACCIATORE and 14 for EROTICA, and the letterset `'AGLM'` gets 8 points, 7 for AMALGAM and 1 for GLAM. The other lettersets represent one word each.
# Let's test that `best_honeycomb(words)` gets the same answer as before, and that the points table has the same set of pangrams as before.
# In[20]:
assert best_honeycomb(words) == (31, ('ACEIORT', 'T'))
assert pangram_sets == {s for s in points_table(enable1) if len(s) == 7}
# # The Solution
# Finally, the solution to the puzzle:
# In[21]:
get_ipython().run_line_magic('time', 'best_honeycomb(enable1)')
# **Wow! 3898 is a high score!** And it took only 2 seconds to find it!
#
#
# # Making it Even Fasterer
#
# OK, that was 30 times faster than my goal of one minute. It was a nice optimization to look at only 64 letter subsets rather than 44,585 words. But I'm still looking at 103,187 honeycombs, and I feel that some of them are a waste of time. Consider the pangram "JUKEBOX". With the uncommon letters J, K, and X, it does not look like a high-scoring honeycomb, no matter what center we choose. So why waste time trying all seven centers? Here's the outline of a faster `best_honeycomb`:
#
# - Go through the pangrams as before
# - However, always keep track of the best score and the best honeycomb that we have found so far.
# - For each new pangram, first see how many points it would score if we ignore the restrriction that a particular center letter must be used. (I compute that with `game_score('')`, where again `game_score` is a local function,
# this time with access to both `pts_table` and `subsets`.)
# - Only if `game_score('')` is better than the best score found so far, then evaluate `game_score(C)` for each of the seven possible centers `C`.
# - In the end, return the best score and the best honeycomb.
# In[22]:
def best_honeycomb(words) -> tuple:
"""Return (score, honeycomb) for the honeycomb with highest score on these words."""
best_score, best_honeycomb = 0, None
pts_table = points_table(words)
pangrams = (s for s in pts_table if len(s) == 7)
for pangram in pangrams:
subsets = string_subsets(pangram)
def game_score(center): return sum(pts_table[s] for s in subsets if center in s)
if game_score('') > best_score:
for C in pangram:
if game_score(C) > best_score:
best_score, best_honeycomb = game_score(C), Honeycomb(pangram, C)
return (best_score, best_honeycomb)
def string_subsets(letters) -> list:
"""All subsets of a string."""
return [''.join(s)
for n in range(len(letters) + 1)
for s in combinations(letters, n)]
get_ipython().run_line_magic('time', 'best_honeycomb(enable1)')
# Looking good! We get the same answer, and in about half a second, four times faster than before.
# # Curiosity
#
# I'm curious about a bunch of things.
#
# What's the highest-scoring individual word?
# In[23]:
max(enable1, key=word_score)
# What are some of the pangrams?
# In[24]:
pangrams[::1000] # Every thousandth one
# What's the breakdown of reasons why words are invalid?
#
# In[25]:
Counter('S' if 'S' in w else '<4' if len(w) < 4 else '>7' if len(set(w)) > 7 else 'valid'
for w in open('enable1.txt').read().upper().split()).most_common()
# There are more than twice as many words with an 'S' as there are valid words.
#
# About the `points_table`: How many different letter subsets are there?
# In[26]:
pts = points_table(enable1)
len(pts)
# That means there's about two valid words for each letterset.
#
# Which lettersets score the most? The least?
# In[27]:
pts.most_common(20)
# In[28]:
pts.most_common()[-20:]
# # Fancy Report
#
# I'd like to see the actual words that each honeycomb can make, in addition to the total score, and I'm curious about how the words are divided up by letterset. Here's a function to provide such a report. I remembered that there is a `fill` function in Python (it is in the `textwrap` module) but this all turned out to be more complicated than I expected. I guess it is difficult to create a practical extraction and reporting tool. I feel you, Larry Wall.
# In[29]:
from textwrap import fill
def report(words, honeycomb=None):
"""Print stats, words, and word scores for the given honeycomb (or
for the best honeycomb if no honeycomb is given) over the given word list."""
optimal = ("" if honeycomb else "optimal ")
if honeycomb is None:
_, honeycomb = best_honeycomb(words)
subsets = letter_subsets(honeycomb)
bins = group_by(words, letterset)
score = sum(word_score(w) for w in words if letterset(w) in subsets)
nwords = sum(len(bins[s]) for s in subsets)
print(f'For this list of {Ns(len(words), "word")}:')
print(f'The {optimal}honeycomb {honeycomb} forms '
f'{Ns(nwords, "word")} for {Ns(score, "point")}.')
print(f'Here are the words formed by each subset, with pangrams first:\n')
for s in sorted(subsets, key=lambda s: (-len(s), s)):
if bins[s]:
pts = sum(word_score(w) for w in bins[s])
print(f'{s} forms {Ns(len(bins[s]), "word")} for {Ns(pts, "point")}:')
words = [f'{w}({word_score(w)})' for w in sorted(bins[s])]
print(fill(' '.join(words), width=80,
initial_indent=' ', subsequent_indent=' '))
def Ns(n, thing, plural=None):
"""Ns(3, 'bear') => '3 bears'; Ns(1, 'world') => '1 world'"""
return f"{n:,d} {thing if n == 1 else plurtal}"
def group_by(items, key):
"Group items into bins of a dict, each bin keyed by key(item)."
bins = defaultdict(list)
for item in items:
bins[key(item)].append(item)
return bins
# In[30]:
report(words, honeycomb)
# In[31]:
report(enable1)
# # S Words
#
# What if we allowed honeycombs (and words) to have an S?
# In[32]:
def S_words(text) -> list:
"""A list of all the valid space-separated words, including words with an S."""
return [w for w in text.upper().split()
if len(w) >= 4 and len(set(w)) <= 7]
# In[33]:
report(S_words(open('enable1.txt').read()))
# # Pictures
#
# Here are pictures for the highest-scoring honeycombs, with and without an S:
#
#