#!/usr/bin/env python
# coding: utf-8

# ## Tutorial : Exploration of full-text indexing
# We'll read in some files, then index the "important" words in their contents, and finally search for some of those words
# 
# For more information and background, please see:  
#         https://julianspolymathexplorations.blogspot.com/2023/08/full-text-search-neo4j-indexing.html

# In[1]:


import set_path      # Importing this module will add the project's home directory to sys.path


# In[2]:


import os
import sys
import getpass

from neoaccess import NeoAccess

from BrainAnnex.modules.neo_schema.neo_schema import NeoSchema
from BrainAnnex.modules.full_text_indexing.full_text_indexing import FullTextIndexing
from BrainAnnex.modules.media_manager.media_manager import MediaManager


# # Connect to the database
# #### You can use a free local install of the Neo4j database, or a remote one on a virtual machine under your control, or a hosted solution, or simply the FREE "Sandbox" : [instructions here](https://julianspolymathexplorations.blogspot.com/2023/03/neo4j-sandbox-tutorial-cypher.html)
# NOTE: This tutorial is tested on version 4 of the Neo4j database, but will probably also work on the new version 5

# In[3]:


# Save your credentials here - or leave them blank, to be prompted for them by the next cell
# NOTE: a host saved here is used exactly as given (no port number gets appended to it)
host = ""             # EXAMPLES:  bolt://123.456.789.012   OR   neo4j://localhost
password = ""


# In[4]:


# Only prompt for whatever was NOT saved in the previous cell,
# so that saved credentials don't get overwritten by the prompts
if not host:
    print("To create a database connection, enter the host IP, but leave out the port number: (EXAMPLES:  bolt://123.456.789.012  OR  neo4j://localhost )\n")

    # Stray blanks around the typed value would otherwise end up in the middle of the final address
    host = input("Enter host IP WITHOUT the port number.  EXAMPLE: bolt://123.456.789.012 ").strip()
    host += ":7687"    # EXAMPLE of host value:  "bolt://123.456.789.012:7687"

if not password:
    password = getpass.getpass("Enter the database password:")     # The typed password isn't echoed

# The password is intentionally masked in the summary
print(f"\n=> Will be using: host='{host}', username='neo4j', password=**********")


# In[ ]:


# In[5]:


# Create the database-access object, from the host and password obtained above, and the user name "neo4j"
db = NeoAccess(host=host,
               credentials=("neo4j", password), debug=False)   # Notice the debug option being OFF


# In[6]:


print("Version of the Neo4j driver: ", db.version())


# # Explorations of Indexing

# In[7]:


# Verify that the database is empty  (if necessary, use db.empty_dbase()  to clear it)
# The Cypher query below counts all the nodes in the database
q = "MATCH (n) RETURN COUNT(n) AS number_nodes"

# NOTE: being a bare expression, the value is only shown when run as a notebook cell;
#       when this file is run as a plain script, the result is discarded.
#       Presumably, `single_cell` makes the call return just the "number_nodes" value - TODO confirm
db.query(q, single_cell="number_nodes")


# #### Initialize the indexing

# In[8]:


# Give the classes used below access to the database connection created earlier
NeoSchema.set_database(db)
FullTextIndexing.db = db


# In[9]:


# The "media folder" is where the text files read in further down are expected to be stored
MediaManager.set_media_folder("D:/tmp/")   # CHANGE AS NEEDED on your system


# In[10]:


# Clear the database.  This is done unconditionally, regardless of the node count checked earlier
db.empty_dbase()                           # WARNING: USE WITH CAUTION!!!


# In[11]:


# Presumably sets up the database schema that the indexing relies on - TODO confirm;
# it's invoked after clearing the database and before any indexing takes place
FullTextIndexing.initialize_schema()


# In[ ]:


# #### Read in 2 files (stored in the "media folder" specified above), and index them

# In[12]:


filename = "test1.txt"      # 1st FILE
file_contents = MediaManager.get_from_file(filename)    # The file is looked up in the "media folder" set earlier
file_contents               # Bare expression: displays the value when run as a notebook cell


# In[13]:


# Extract the words to index from the text just read in
word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list


# #### Note that many common words get dropped...

# In[14]:


# Create a data node of class "Content Item" to represent the file, with the file name stored in its "name" property
# (the returned value is presumably the ID of the newly-created node - TODO confirm)
content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
content_item_id


# In[15]:


# Index the chosen words for this first Content Item
FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)


# #### Process the 2nd Content Item

# In[16]:


# Same steps as for the 1st file; note that `filename`, `file_contents`, `word_list`
# and `content_item_id` get re-assigned, replacing the values from the 1st file
filename = "test2.htm"     # 2nd FILE
file_contents = MediaManager.get_from_file(filename)    # The file is looked up in the "media folder" set earlier
file_contents              # Bare expression: displays the value when run as a notebook cell


# In[17]:


# Extract the words to index from the text just read in
word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list


# In[18]:


# Create a second data node of class "Content Item", with the file name stored in its "name" property
content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
content_item_id


# In[19]:


# Index the chosen words for this 2nd Content Item
FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)


# _Here's what we have created so far:_

# ![Full Text Indexing](../BrainAnnex/docs/tutorial_full_text_indexing.png)

# In[ ]:


# ### The following function provides a simple way to search for content that includes a given word in the index, for demonstration purposes; for actual use, please see the methods provided by the `FullTextIndexing` class

# In[20]:


def search_word(word :str) -> [str]:
    """
    Look up any stored words that contain the requested one (ignoring case),
    then locate the Content Items that are indexed by any of those words.

    The search term is passed to the database as a query parameter, rather than being
    spliced into the text of the Cypher query: that way, quotes or other special characters
    in the term can neither break the query nor alter its meaning.

    :param word:    The string to search for.  It gets converted to lower case by the query
                        before being compared against the stored words
                        (which are presumably stored in lower case - TODO confirm)
    :return:        A list of the values of the "name" attributes in all the found Content Items
    """
    # `$word` is a Cypher parameter, whose value is provided thru the data binding below
    q = '''MATCH (w:Word)-[:occurs]->(:Indexer)<-[:has_index]-(ci:`Content Item`)
         WHERE w.name CONTAINS toLower($word)
         RETURN DISTINCT ci.name AS content_name
         '''
    return db.query(q, data_binding={"word": word}, single_column="content_name")


# In[ ]:


# # Now we can finally try out some word searches

# ### Using the search_word() function above:

# In[21]:


# NOTE: all the calls below are bare expressions; their results are displayed when run
#       as notebook cells, but are discarded when this file is run as a plain script
search_word("hello")


# In[22]:


search_word("world")


# ### Or using methods provided by the `FullTextIndexing` class:

# In[23]:


# This is the method of the `FullTextIndexing` class, NOT the local function by the same name defined above
FullTextIndexing.search_word("world", all_properties=True)


# In[ ]:


# ### IMPORTANT: make sure to search for the word STEMS, in order to find all variants!!
# For example, search for "potato" in order to find both "potato" and "potatoes".

# In[24]:


# Upper-case search terms are fine, because the local search_word() converts the term to lower case in its query
search_word("POTATO")


# In[25]:


# The longer variant: the local search_word() only matches stored words that CONTAIN the whole search term
search_word("POTATOES")


# In[26]:


search_word("Learn")


# In[27]:


search_word("Learning")


# In[ ]: