#!/usr/bin/env python
# coding: utf-8

# ## Tutorial : Exploration of full-text indexing
# We'll read in some files, then index the "important" words in their contents,
# and finally search for some of those words
#
# For more info and background info, please see:
# https://julianspolymathexplorations.blogspot.com/2023/08/full-text-search-neo4j-indexing.html

# In[1]:


import set_path      # Importing this module will add the project's home directory to sys.path


# In[2]:


import os
import sys
import getpass

from neoaccess import NeoAccess
from BrainAnnex.modules.neo_schema.neo_schema import NeoSchema
from BrainAnnex.modules.full_text_indexing.full_text_indexing import FullTextIndexing
from BrainAnnex.modules.media_manager.media_manager import MediaManager


# # Connect to the database
# #### You can use a free local install of the Neo4j database, or a remote one on a virtual machine under your control, or a hosted solution, or simply the FREE "Sandbox" : [instructions here](https://julianspolymathexplorations.blogspot.com/2023/03/neo4j-sandbox-tutorial-cypher.html)
# NOTE: This tutorial is tested on version 4 of the Neo4j database, but will probably also work on the new version 5

# In[3]:


# Save your credentials here - or leave them empty to be prompted by the next cell
host = ""               # EXAMPLES:  bolt://123.456.789.012   OR   neo4j://localhost
password = ""


# In[4]:


# Only prompt for whatever was NOT saved above; previously the prompts always ran
# and silently overwrote any saved credentials when this file was executed as a script.
if not host:
    print("To create a database connection, enter the host IP, but leave out the port number: (EXAMPLES: bolt://123.456.789.012 OR neo4j://localhost )\n")

    host = input("Enter host IP WITHOUT the port number. EXAMPLE: bolt://123.456.789.012 ")
    host += ":7687"    # EXAMPLE of host value: "bolt://123.456.789.012:7687"
# (A host saved in the previous cell is used exactly as written: no port is appended to it)

if not password:
    password = getpass.getpass("Enter the database password:")

# The password is deliberately masked in the confirmation message
print(f"\n=> Will be using: host='{host}', username='neo4j', password=**********")


# In[ ]:





# In[5]:


db = NeoAccess(host=host,
               credentials=("neo4j", password), debug=False)   # Notice the debug option being OFF


# In[6]:


print("Version of the Neo4j driver: ", db.version())


# # Explorations of Indexing

# In[7]:


# Verify that the database is empty (if necessary, use db.empty_dbase() to clear it)
# NOTE: in a notebook the node count is displayed as the cell's output;
#       when run as a plain script, the returned value is discarded
q = "MATCH (n) RETURN COUNT(n) AS number_nodes"

db.query(q, single_cell="number_nodes")


# #### Initialize the indexing

# In[8]:


# Make both the schema layer and the full-text indexing layer use the same database connection
NeoSchema.set_database(db)
FullTextIndexing.db = db


# In[9]:


MediaManager.set_media_folder("D:/tmp/")       # CHANGE AS NEEDED on your system


# In[10]:


db.empty_dbase()      # WARNING: USE WITH CAUTION!!!


# In[11]:


FullTextIndexing.initialize_schema()


# In[ ]:





# #### Read in 2 files (stored in the "media folder" specified above), and index them

# In[12]:


filename = "test1.txt"                                  # 1st FILE
file_contents = MediaManager.get_from_file(filename)    # Looked up in the media folder set earlier
file_contents


# In[13]:


word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list


# #### Note that many common words get dropped...

# In[14]:


# Create a data node of class "Content Item" to represent the 1st file
content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
content_item_id


# In[15]:


# Index the chosen words for this first Content Item
FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)


# #### Process the 2nd Content Item

# In[16]:


filename = "test2.htm"                                  # 2nd FILE
file_contents = MediaManager.get_from_file(filename)
file_contents


# In[17]:


word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list


# In[18]:


content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
content_item_id


# In[19]:


# Index the chosen words for this 2nd Content Item
FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)


# _Here's what we have created so far:_

# ![Full Text Indexing](../BrainAnnex/docs/tutorial_full_text_indexing.png)

# In[ ]:





# ### The following function provides a simple way to search content that includes a given word in the index, for demonstration purposes; for actual use, please see the methods provided by the `FullTextIndexing` class

# In[20]:


def search_word(word :str) -> [str]:
    """
    Look up any stored words that contain the requested one (ignoring case.)
    Then locate the Content Items that are indexed by any of those words.
    Return a list of the values of the "name" attributes in all the found Content Items

    The match is a substring match (Cypher `CONTAINS`) against the lower-cased search term,
    so searching for "potato" also locates items indexed under "potatoes".

    :param word:    The string to search for; its case is ignored
    :return:        A list of the distinct "name" values of all the Content Items found
    """
    # The search term is passed as a query parameter ($word) rather than being spliced into
    # the Cypher text: a quote or other special character in `word` can then neither break
    # the query nor inject extra Cypher
    q = '''MATCH (w:Word)-[:occurs]->(:Indexer)<-[:has_index]-(ci:`Content Item`)
           WHERE w.name CONTAINS toLower($word)
           RETURN DISTINCT ci.name AS content_name
        '''

    result = db.query(q, data_binding={"word": word}, single_column="content_name")
    return result


# In[ ]:





# # Now we can finally try out some word searches
# ### Using the search_word() function above:

# In[21]:


search_word("hello")


# In[22]:


search_word("world")


# ### Or using methods provided by the `FullTextIndexing` class:

# In[23]:


FullTextIndexing.search_word("world", all_properties=True)


# In[ ]:





# ### IMPORTANT: make sure to search for the word STEMS, in order to find all variants!!
# For example, search for "potato" in order to find both "potato" and "potatoes".

# In[24]:


search_word("POTATO")


# In[25]:


search_word("POTATOES")


# In[26]:


search_word("Learn")


# In[27]:


search_word("Learning")


# In[ ]: