#!/usr/bin/env python # coding: utf-8 # ## `NeoAccess` library - Tutorial : Importing data into Neo4j from a Pandas data frame, # #### using the `load_pandas()` method # # #### [Overview and Intro article](https://julianspolymathexplorations.blogspot.com/2023/06/neo4j-python-neoaccess-library.html) # # #### [Reference guide](https://brainannex.org/docs/neoaccess.htm) # # #### CAUTION: running this tutorial will clear out the database # In[1]: import os import sys import getpass import pandas as pd from neoaccess import NeoAccess # In case of problems, try a sys.path.append(directory) , where directory is your project's root directory # # 1. Connecting to the database # #### You can use a free local install of the Neo4j database, or a remote one on a virtual machine under your control, or a hosted solution # NOTE: This tutorial is tested on **version 4.4** of the Neo4j database, but will probably also work on the new version 5 (NOT guaranteed, however...) # In[2]: # Save your credentials here - or use the prompts given by the next cell host = "" # EXAMPLES: bolt://123.456.789.012 OR neo4j://localhost # (CAUTION: do NOT include the port number!) password = "" # In[3]: db = NeoAccess(host=host, credentials=("neo4j", password)) # In[4]: print("Version of the Neo4j driver: ", db.version()) # In[ ]: # In[ ]: # # 2. Importing data into an empty database from a Pandas dataframe # ### In the following example, the database entities are EMPLOYEES at a company # In[5]: # CLEAR OUT THE DATABASE #db.empty_dbase() # UNCOMMENT IF DESIRED ***************** WARNING: USE WITH CAUTION!!! ************************ # In[6]: # Prepare a Pandas data frame with the data. (In some use cases, the data frame would be read in from a CSV file) df = pd.DataFrame({"employee ID": [100, 200, 300], "active": [True, True, True], "name": ["John Doe", "Valerie Leben", "Jill Smith"], "job title": ["administrative assistant", "CEO", "head of marketing"], "salary": [80000, 400000, 250000] }) df # In[7]: # Always a good idea to make sure that the data types are what we need them to be, # especially if your dataframe originated from a CSV file. # Note: it's normal for strings show up as "object" in a Pandas dataframe (because of their variable length) df.dtypes # In[8]: # Now import the data into the database. # We'll use the label "Employee". Note: blanks are allowed, and so are multiple labels (use a list or tuple) internal_ids = db.load_pandas(df, labels="Employee") internal_ids # This is a list of the Neo4j ID's assigned to the new nodes # ## Voila', as simple as that! Here's what the database looks like at this stage: # ![Three imported records](https://raw.githubusercontent.com/BrainAnnex/brain-annex/main/docs/tutorials_neoaccess_load_pandas.jpg) # #### Now, retrieve the newly-created nodes # In[9]: match_specs = db.match(labels="Employee") # Our specifications to later locate nodes by (here, by using labels) # NO DATABASE OPERATION PERFORMED HERE print(match_specs) # In[10]: db.get_nodes(match_specs) # #### Side note - If APOC is installed in the database, we can verify that the **data types** got correctly imported by issuing the following Cyher command: # In[11]: q = '''MATCH (n :Employee) RETURN apoc.meta.cypher.types(n) LIMIT 1 ''' db.query(q) # Run the Cypher query # In[ ]: # In[ ]: # # 3. Adding data to an existing database from a Pandas dataframe (keeping existing fields) # In[12]: # Let's review our old, original data frame df # #### Now, consider a NEW dataframe with changes/updates # This dataframe spells out the details of Jill's raise: # In[13]: df_update_1 = pd.DataFrame({"employee ID": [300], "salary": [270000] }) df_update_1 # #### The new data can be MERGED into the existing data, by using the "employee ID" for matching against existing records # Notice the use of the argument `merge_primary_key="employee ID"` # In[14]: db.load_pandas(df_update_1, labels="Employee", merge_primary_key="employee ID", merge_overwrite=False) # merge_overwrite=False means updating the record rather then completely over-writing it, # i.e. "KEEP EXISTING DATA IN FIELDS NOT SPECIFIED IN THE DATAFRAME" # In[15]: db.get_nodes(match_specs) # The specs haven't changed: we still want to "locate all nodes with the "Employee" label # #### Notice how Jill got her raise, and her other fields were left untouched # In[ ]: # In[ ]: # # 4. Adding data to an existing database from a Pandas dataframe (completely over-writing records) # #### Let's say that John is retiring. We may want to keep the record with the employee ID and the name, but flip the "active" field to false, and DROP all other fields # This can be done by importing the following dataframe, again with `merge_primary_key="employee ID"` BUT this time with the `merge_overwrite=True` option # In[16]: df_update_2 = pd.DataFrame({"employee ID": [100], "active": [False], "name": ["John Doe"] }) df_update_2 # In[17]: db.load_pandas(df_update_2, labels="Employee", merge_primary_key="employee ID", merge_overwrite=True) # merge_overwrite=True means blanking out the record and then re-building it, # i.e. "DON'T KEEP EXISTING DATA IN FIELDS NOT SPECIFIED IN THE DATAFRAME" # In[18]: db.get_nodes(match_specs) # The specs haven't changed: we still want to "locate all nodes with the "Employee" label # ### Notice how John's old record got ditched, and COMPLETELY REPLACED with the new data (containing fewer fields) # Nothing else got modified # In[ ]: # # For more options to use with `load_pandas()`, please see the [reference guide](https://brainannex.org/docs/neoaccess.htm) # In[ ]: