We'll read in some files, then index the "important" words in their contents, and finally search for some of those words
For more details and background information, please see:
https://julianspolymathexplorations.blogspot.com/2023/08/full-text-search-neo4j-indexing.html
Create a file named test1.txt and one named test2.htm, and place them in a local folder of your choice (make a note of its name!)
Contents of test1.txt:
hello to the world !!! ? Welcome to learning how she cooks with potatoes...
Contents of test2.htm:
Let's make a much better world, shall we? What do you say to that enticing prospect?
Starting on a small scale – we’ll learn cooking a potato well.
Also, change the value of the variable MY_FOLDER, below, to the location on your computer where you stored the above files,
and use your database login credentials.
import set_path # Importing this module will add the project's home directory to sys.path
Added 'D:\Docs\- MY CODE\Brain Annex\BA-Win7' to sys.path
import os
import sys
import getpass
from neoaccess import NeoAccess
from BrainAnnex.modules.neo_schema.neo_schema import NeoSchema
from BrainAnnex.modules.full_text_indexing.full_text_indexing import FullTextIndexing
from BrainAnnex.modules.media_manager.media_manager import MediaManager
MY_FOLDER = "D:/tmp/tests for tutorials/" # ****** IMPORTANT: CHANGE AS NEEDED on your system; use forward slashes on Windows, too! ******
NOTE: This tutorial was tested on version 4.4 of the Neo4j database, but will probably also work on the newer version 5
# Save your credentials here - or use the prompts given by the next cell
host = "YOUR_HOST"
password = "YOUR_PASSWORD"
# Interactive alternative to hard-coding the credentials above:
# prompt for the host and password at run time, then build the full connection string
print("To create a database connection, enter the host IP, but leave out the port number: (EXAMPLES: bolt://1.2.3.4 OR neo4j://localhost )\n")
host = input("Enter host IP WITHOUT the port number. EXAMPLE: bolt://123.456.789.012 ")
# Pasted values often carry stray whitespace or a trailing slash;
# remove them, or appending the port below would produce a malformed URI
host = host.strip().rstrip("/")
host += ":7687" # EXAMPLE of host value: "bolt://123.456.789.012:7687"
password = getpass.getpass("Enter the database password:")    # The password is not echoed while typing
print(f"\n=> Will be using: host='{host}', username='neo4j', password=**********")
To create a database connection, enter the host IP, but leave out the port number: (EXAMPLES: bolt://1.2.3.4 OR neo4j://localhost )
=> Will be using: host='bolt://123.456.789.012:7687', username='neo4j', password=**********
# Open the connection to the Neo4j database, logging in as user "neo4j"
# with the host and password gathered earlier
db = NeoAccess(
    host=host,
    credentials=("neo4j", password),
    debug=False,    # Notice the debug option being OFF
)
Connection to Neo4j database established.
print("Version of the Neo4j driver: ", db.version())
Version of the Neo4j driver: 4.4.11
# Clear out the database, so that the tutorial starts from an empty one (verified just below).
# Do NOT run this against a database whose contents you wish to keep!
db.empty_dbase() # ****** WARNING: USE WITH CAUTION!!! ******
# Verify that the database is empty, by counting all of its nodes (expected result: 0)
q = "MATCH (n) RETURN COUNT(n) AS number_nodes"
db.query(q, single_cell="number_nodes")    # Show just the value of the "number_nodes" field
0
# Hand the open database connection to the two library classes used in the rest of this tutorial
NeoSchema.set_database(db)
FullTextIndexing.db = db
# One-time setup for full-text indexing, run after the database connection is in place
# (presumably it creates the Schema elements, such as the "Content Item" class, that the
#  indexing below relies on - consult the FullTextIndexing documentation to confirm)
FullTextIndexing.initialize_schema()
# Read in the contents of the first sample file, located in the folder MY_FOLDER
filename = "test1.txt" # 1st FILE
file_contents = MediaManager.get_from_text_file(path=MY_FOLDER, filename=filename)
file_contents    # Display the text that was read in
'hello to the world !!! ? Welcome to learning how she cooks with potatoes...'
# Reduce the text to its set of unique "good" words.  Judging from the result shown below,
# the words are lower-cased, while punctuation and common words (such as "to", "the") are dropped
word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list # Not shown in any particular order
{'cooks', 'learning', 'potatoes', 'welcome', 'world'}
# Create a data node of the Schema class "Content Item" to represent this file, storing the file name
# in its "name" property.  The returned value is the internal ID of the new node
# (the actual number will likely differ on your system)
internal_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
internal_id
1963
# Index the chosen words for this first Content Item:
# the node created for the file is identified by its internal ID, and gets associated with the words in word_list
FullTextIndexing.new_indexing(internal_id = internal_id, unique_words = word_list)
# Read in the contents of the second sample file (an HTML document), located in the folder MY_FOLDER
filename = "test2.htm" # 2nd FILE
file_contents = MediaManager.get_from_text_file(path=MY_FOLDER, filename=filename)
file_contents    # Display the text that was read in, HTML tags included
"<p>Let's make a <i>much better world</i>, shall we? What do you say to that enticing prospect?</p>\n\n<p>Starting on a small scale – we’ll learn cooking a potato well.</p>"
# Reduce the text to its set of unique "good" words.  Judging from the result shown below,
# HTML tags (such as <p> and <i>) do not end up among the words, and common words are dropped
word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list    # Not shown in any particular order
{'cooking', 'enticing', 'learn', 'potato', 'prospect', 'say', 'scale', 'world'}
# Create a second data node of the Schema class "Content Item", this time to represent the HTML file.
# The returned value is the internal ID of the new node (the actual number will likely differ on your system)
internal_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
internal_id
1970
# Index the chosen words for this 2nd Content Item:
# the node created for the file is identified by its internal ID, and gets associated with the words in word_list
FullTextIndexing.new_indexing(internal_id = internal_id, unique_words = word_list)
Here's what we have created so far (Note: THE INDEXED WORDS MIGHT VARY, BASED ON THE LATEST LIST OF COMMON WORDS TO DROP):

# Note: the FullTextIndexing class also provides a search_word() method;
#       the function below is a standalone counterpart, defined here for the purposes of this tutorial
def search_word(word: str) -> [str]:
    """
    Look up any stored words that contain the requested one (ignoring case.)
    Then locate the Content Items that are indexed by any of those words.
    Return a list of the values of the "name" attributes in all the found Content Items

    :param word:    A string to search for; it may be a whole word or just a part of one.
                        EXAMPLE: "potato" will match both "potato" and "potatoes"
    :return:        A list of the distinct values of the "name" attribute
                        of all the Content Items that were located (possibly an empty list)
    """
    # The search term is passed as a query parameter ($word), NOT spliced into the Cypher string:
    # this way, a term containing quotes or other special characters can neither break the query
    # nor alter its meaning (Cypher injection)
    q = '''MATCH (w:Word)-[:occurs]->(:Indexer)<-[:has_index]-(ci:`Content Item`)
           WHERE w.name CONTAINS toLower($word)
           RETURN DISTINCT ci.name AS content_name
        '''

    # `db` is the database-connection object created earlier in this notebook
    return db.query(q, data_binding={"word": word}, single_column="content_name")
search_word("world")
['test1.txt', 'test2.htm']
FullTextIndexing class:
FullTextIndexing.search_word("world", all_properties=True)
[{'name': 'test1.txt', 'internal_id': 1963, 'neo4j_labels': ['Content Item']},
{'name': 'test2.htm', 'internal_id': 1970, 'neo4j_labels': ['Content Item']}]
For example, search for "potato" in order to find both "potato" and "potatoes".
search_word("POTATO")
['test1.txt', 'test2.htm']
search_word("POTATOES")
['test1.txt']
search_word("Learn")
['test1.txt', 'test2.htm']
search_word("Learning")
['test1.txt']
search_word("Supercalifragili")
[]
Currently supported: indexing of text files, HTML files (e.g., formatted notes) and PDF documents.