#!/usr/bin/env python
# coding: utf-8

# ## Tutorial : Exploration of full-text indexing
# We'll read in some files, then index the "important" words in their contents, and finally search for some of those words
#
# For more details and background info, please see:
# https://julianspolymathexplorations.blogspot.com/2023/08/full-text-search-neo4j-indexing.html
#
# #### CAUTION: running this tutorial will clear out the database!
#
# ---
#
# ## PREPARATIONS: to run this tutorial, create a text file named `test1.txt` and one named `test2.htm`
# and place them in a local folder of your choice (make a note of its name!)
#
# ---
# **Contents of test1.txt:**
# hello to the world !!! ? Welcome to learning how she cooks with potatoes...
#
# **Contents of test2.htm:**
#
# Let's make a much better world, shall we? What do you say to that enticing prospect?
#
# Starting on a small scale – we’ll learn cooking a potato well.
#
#
# ---
# Also, change the value of the variable `MY_FOLDER` , below, to the location on your computer where you stored the above files,
# and use your database login credentials.

# In[1]:


import set_path      # Importing this module will add the project's home directory to sys.path


# In[2]:


import os
import sys
import getpass

from neoaccess import NeoAccess
from BrainAnnex.modules.neo_schema.neo_schema import NeoSchema
from BrainAnnex.modules.full_text_indexing.full_text_indexing import FullTextIndexing
from BrainAnnex.modules.media_manager.media_manager import MediaManager


# In[3]:


MY_FOLDER = "D:/tmp/tests for tutorials/"       # ****** IMPORTANT: CHANGE AS NEEDED on your system; use forward slashes on Windows, too! ******


# In[ ]:


# # Connect to the database
# #### You can use a free local install of the Neo4j database, or a remote one on a virtual machine under your control, or a hosted solution, or simply the FREE "Sandbox" : [instructions here](https://julianspolymathexplorations.blogspot.com/2023/03/neo4j-sandbox-tutorial-cypher.html)
# NOTE: This tutorial is tested on version 4.4 of the Neo4j database, but will probably also work on the new version 5

# In[4]:


# Save your credentials here - or leave the placeholders unchanged to be prompted for them below.
# A host value saved here must already include the port number.  EXAMPLE: "bolt://123.456.789.012:7687"
host = "YOUR_HOST"
password = "YOUR_PASSWORD"


# In[2]:


# Only prompt for the values that are still set to their placeholders,
# so that credentials saved in the previous cell are not overwritten when the whole file is run top to bottom
if host == "YOUR_HOST":
    print("To create a database connection, enter the host IP, but leave out the port number: (EXAMPLES: bolt://1.2.3.4 OR neo4j://localhost )\n")

    host = input("Enter host IP WITHOUT the port number. \nEXAMPLE: bolt://123.456.789.012 ")
    host += ":7687"    # EXAMPLE of host value: "bolt://123.456.789.012:7687"

if password == "YOUR_PASSWORD":
    password = getpass.getpass("Enter the database password:")

# The password is deliberately masked in the printout
print(f"\n=> Will be using: host='{host}', username='neo4j', password=**********")


# In[ ]:





# In[5]:


db = NeoAccess(host=host,
               credentials=("neo4j", password), debug=False)   # Notice the debug option being OFF


# In[6]:


print("Version of the Neo4j driver: ", db.version())


# # Explorations of Indexing

# In[7]:


db.empty_dbase()       # ****** WARNING: USE WITH CAUTION!!! ******


# In[8]:


# Verify that the database is empty
q = "MATCH (n) RETURN COUNT(n) AS number_nodes"

db.query(q, single_cell="number_nodes")


# #### Initialize the indexing

# In[9]:


# Both classes are pointed at the same database object
NeoSchema.set_database(db)
FullTextIndexing.db = db


# In[10]:


FullTextIndexing.initialize_schema()


# In[ ]:





# #### Read in 2 files (stored in the MY_FOLDER location specified above), and index them

# In[11]:


filename = "test1.txt"      # 1st FILE
file_contents = MediaManager.get_from_text_file(path=MY_FOLDER, filename=filename)
file_contents


# In[12]:


word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list       # Not shown in any particular order


# #### Note that many common words get dropped...
# In[13]:


internal_id = NeoSchema.create_data_node(class_node="Content Item", properties={"name": filename})
internal_id


# In[16]:


# Index the chosen words for this first Content Item
FullTextIndexing.new_indexing(internal_id=internal_id, unique_words=word_list)


# #### Process the 2nd Content Item

# In[17]:


filename = "test2.htm"      # 2nd FILE
file_contents = MediaManager.get_from_text_file(path=MY_FOLDER, filename=filename)
file_contents


# In[18]:


word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list


# In[19]:


internal_id = NeoSchema.create_data_node(class_node="Content Item", properties={"name": filename})
internal_id


# In[20]:


# Index the chosen words for this 2nd Content Item
FullTextIndexing.new_indexing(internal_id=internal_id, unique_words=word_list)


# _Here's what we have created so far (Note: **THE INDEXED WORDS MIGHT VARY, BASED ON THE LATEST LIST OF COMMON WORDS TO DROP**):_

# ![Full Text Indexing](../BrainAnnex/docs/tutorial_full_text_indexing.png)

# In[ ]:





# ### The following function provides a simple way to search content that includes a given word in the index, for demonstration purpose; for actual use, please see the methods provided by the `FullTextIndexing` class

# In[21]:


def search_word(word :str) -> [str]:
    """
    Look up any stored words that contain the requested one (ignoring case.)
    Then locate the Content Items that are indexed by any of those words.
    Return a list of the values of the "name" attributes in all the found Content Items

    :param word:    The string to search for; it is lower-cased by the query before being matched
                        as a substring of the "name" property of the `Word` nodes
    :return:        The result of the database query on its "content_name" column
                        (the distinct "name" values of the matching `Content Item` nodes)
    """
    # The search term is passed as a query parameter ($word), rather than being spliced into the Cypher text:
    # that way, quotes or other special characters in it cannot break - or alter - the query
    q = '''MATCH (w:Word)-[:occurs]->(:Indexer)<-[:has_index]-(ci:`Content Item`)
           WHERE w.name CONTAINS toLower($word)
           RETURN DISTINCT ci.name AS content_name
        '''

    return db.query(q, data_binding={"word": word}, single_column="content_name")


# In[ ]:





# # Now, we can finally try out some word searches

# ### Using the search_word() function above:

# In[22]:


search_word("world")


# ### Or using methods provided by the `FullTextIndexing` class:

# In[23]:


FullTextIndexing.search_word("world", all_properties=True)


# In[ ]:





# ### IMPORTANT: make sure to search for the word *STEMS*, in order to find all variants!!
# For example, search for "potato" in order to find both "potato" and "potatoes".

# In[24]:


search_word("POTATO")


# In[25]:


search_word("POTATOES")


# In[26]:


search_word("Learn")


# In[27]:


search_word("Learning")


# In[28]:


search_word("Supercalifragili")


# In[ ]:





# ### Note: full-text indexing and search is also available as part of the UI of the web app that is included in the release of Brain Annex.
# Currently supported: indexing of text files, HTML files (e.g., formatted notes) and PDF documents.

# In[ ]: