import os
import pandas as pd
import numpy as np
import seaborn as sns
targets_path=r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Datasets\HAR_Dataset\targets.npy"
dataset_path=r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Datasets\HAR_Dataset\data.npy"
from sklearn.model_selection import train_test_split
#Train/val/test
def split_data(data, targets, test_size, random_state=42):
    """Split paired arrays into train/test partitions with a shared shuffle.

    Parameters
    ----------
    data, targets : indexable arrays of equal length (e.g. numpy arrays);
        both are indexed with the same shuffled index lists so pairs stay aligned.
    test_size : float or int
        Forwarded to sklearn's train_test_split.
    random_state : int, optional
        Shuffle seed. Defaults to 42 — the previously hard-coded value — so
        existing call sites keep their exact behavior.

    Returns
    -------
    (samples_train, targets_train, samples_test, targets_test)
    """
    train_idx, test_idx = train_test_split(
        range(len(data)), test_size=test_size, random_state=random_state
    )
    samples_train = data[train_idx]
    targets_train = targets[train_idx]
    samples_test = data[test_idx]
    targets_test = targets[test_idx]
    return samples_train, targets_train, samples_test, targets_test
def save_data(folder, data, targets):
    """Persist a (data, targets) array pair as data.npy / targets.npy in *folder*.

    The folder is created if missing (matching the makedirs pattern used for
    the per-subset get_data output below), so callers need not pre-create it.
    """
    os.makedirs(folder, exist_ok=True)
    np.save(os.path.join(folder, "data"), data)
    np.save(os.path.join(folder, "targets"), targets)
# Load the full dataset from the paths declared at the top of the file.
# (Originally `data`/`targets` were used here without ever being loaded —
# notebook residue; the arrays existed only in the interactive session.)
data = np.load(dataset_path)
targets = np.load(targets_path)
# 80/20 train/test, then 90/10 train/val carved out of the training portion.
samples_train, targets_train, samples_test, targets_test = split_data(data, targets, 0.2)
samples_train, targets_train, samples_val, targets_val = split_data(samples_train, targets_train, 0.1)
train_folder = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Datasets\HAR_Dataset_raw\Train"
test_folder = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Datasets\HAR_Dataset_raw\Test"
val_folder = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Datasets\HAR_Dataset_raw\Val"
save_data(train_folder, samples_train, targets_train)
save_data(test_folder, samples_test, targets_test)
save_data(val_folder, samples_val, targets_val)
participants_dir = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Participants"
# Sort for a deterministic participant order — os.listdir order is arbitrary,
# and the positional train/val split below depends on it.
participant_folders = [os.path.join(participants_dir, participant)
                       for participant in sorted(os.listdir(participants_dir))]
# 32 participant folders existed when this was written (the stray `32` output
# artifact and the no-op `len(...)` expression from the notebook were removed).
participants_train = participant_folders[:25]
participants_val = participant_folders[25:]
# participants_val = participant_folders[25:30]
# participants_test = participant_folders[-5:]
def get_file_stem(path):
    """Return the final path component of *path* with its extension removed."""
    stem, _ext = os.path.splitext(os.path.basename(path))
    return stem
def read_metadata(df_path):
    """Load a space-separated clip manifest into a DataFrame.

    The file has no header row; columns are labelled
    video_path / frames / label.
    """
    return pd.read_csv(
        df_path,
        sep=" ",
        header=None,
        names=["video_path", "frames", "label"],
    )
def df_to_txt(df, dir_path):
    """Append *df* to dir_path as space-separated text without header or index."""
    df.to_csv(dir_path, sep=" ", header=False, index=False, mode="a")
# NOTE(review): bare expression — notebook-cell residue; evaluates and discards
# the list, no effect when run as a script.
participants_val
def get_data(data_folders, subset, dataset_dir):
    """Merge each participant's clip manifest into one <subset>.txt table.

    NOTE(review): this definition is shadowed by the 4-parameter get_data
    defined later in this file; only the later one survives at module level.

    Parameters
    ----------
    data_folders : list of str
        Participant folders; each must contain
        FullDataset/HARClips_dataset_<participant>.txt.
    subset : str
        Output name — the merged table is written to <dataset_dir>/<subset>.txt
        (appended via df_to_txt, which opens in mode "a").
    dataset_dir : str
        Destination directory for the merged manifest.
    """
    # The file-level `import json` only appears much later in this file, so a
    # call to this function before that point would raise NameError — import
    # locally to make the definition self-contained.
    import json

    df_dict = {
        "video_path": [],
        "label": [],
        "frames": [],
    }
    for folder in data_folders:
        participant = get_file_stem(folder)
        manifest_path = os.path.join(folder, "FullDataset", f"HARClips_dataset_{participant}.txt")
        # .txt extension, but the content is JSON:
        # {"video_path": [...], "label": [...], "frames": [...]}
        with open(manifest_path, "rb") as f:
            participant_dict = json.load(f)
        for key, values in participant_dict.items():
            df_dict[key] = df_dict[key] + values
    out_path = os.path.join(dataset_dir, f"{subset}.txt")
    df_to_txt(pd.DataFrame.from_dict(df_dict), out_path)
    print("Finished")
def get_data(data_folders, subset, dataset_dir, dataset_root="CompleteData"):
    """Concatenate per-participant segment/target arrays into one subset on disk.

    Parameters
    ----------
    data_folders : list of str
        Participant folders, each containing
        <dataset_root>/segments_data.npy and <dataset_root>/targets_data.npy.
    subset : str
        Subset name ("Train", "Val", ...); becomes a subfolder of dataset_dir.
    dataset_dir : str
        Destination root; <dataset_dir>/<subset>/{data,targets}.npy are written.
    dataset_root : str, optional
        Per-participant subfolder holding the .npy files. Defaults to
        "CompleteData" (the value used below) so the 3-argument call sites
        elsewhere in this file no longer raise TypeError.
    """
    segments = []
    labels = []
    for folder in data_folders:
        # (the original also computed an unused `participant` stem here)
        segments.append(np.load(os.path.join(folder, dataset_root, "segments_data.npy")))
        labels.append(np.load(os.path.join(folder, dataset_root, "targets_data.npy")))
    segments = np.concatenate(segments)
    labels = np.concatenate(labels)
    out_dir = os.path.join(dataset_dir, subset)
    os.makedirs(out_dir, exist_ok=True)  # race-free replacement for isdir+makedirs
    np.save(os.path.join(out_dir, "data"), segments)
    np.save(os.path.join(out_dir, "targets"), labels)
folders = [participants_train, participants_val]
subsets = ["Train", "Val"]
dataset_root = "CompleteData"
dataset_dir = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Datasets\HAR_Dataset_participants"
# Use a distinct loop variable: the original reused `folders`, shadowing the
# list being iterated, and captured get_data's None return in `test`.
for subset_folders, subset in zip(folders, subsets):
    get_data(subset_folders, subset, dataset_dir, dataset_root)
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

# Fit the label encoder and feature scaler on the TRAIN subset only, then
# persist them next to the data so inference can reuse the same transforms.
base_dir = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\Accelerometer_Data\Datasets\HAR_Dataset_participants\Train"
targets = np.load(os.path.join(base_dir, "targets.npy"))
data = np.load(os.path.join(base_dir, "data.npy"))
encoder = LabelEncoder().fit(targets)
# Flatten windows to (n_samples * window_len, 9) before fitting — presumably
# 9 sensor channels per timestep; TODO confirm against the dataset builder.
scaler = StandardScaler().fit(data.reshape(-1, 9))
joblib.dump(encoder, os.path.join(base_dir, "encoder.pkl"))
joblib.dump(scaler, os.path.join(base_dir, "scaler.pkl"))
# (removed: stray notebook output artifact — the list literal joblib.dump returned)
df_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Participants\0304d\FullDataset\HARClips_dataset_0304d.txt"
import json

# The manifest is JSON despite the .txt extension. (The original first read it
# with pd.read_csv and immediately discarded that result — dead work, removed.)
with open(df_path, "rb") as f:
    df = json.load(f)
# NOTE(review): bare expression — notebook residue, no effect in a script.
df["video_path"]
dataset_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\HAR_dataset_v1"
# NOTE(review): `participants_test` is never defined in this file — its
# assignment near the top is commented out — so this line raises NameError.
folders = [participants_train,participants_test,participants_val]
subsets =["Train","Test","Val"]
for folders,subset in zip(folders,subsets):
    # NOTE(review): the surviving 4-parameter get_data definition requires a
    # `dataset_root` argument; this 3-argument call raises TypeError. It was
    # presumably written against the earlier 3-parameter get_data — verify intent.
    test = get_data(folders,subset,dataset_path)
# NOTE(review): get_data returns None, so from_dict(None) would raise —
# dead notebook residue kept as-is.
df = pd.DataFrame.from_dict(test)
df
base_dir = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\HAR_AL"
# Build incremental "active learning" training sets from growing participant
# windows: index pairs (30,35),(35,40),...,(65,70), with the final window end
# clamped to 68.
# NOTE(review): participant_folders held only 32 entries earlier in this file,
# so slices starting at 30 would be near-empty — presumably the folder list
# differed when this cell ran; verify before reuse.
for index, (id1, id2) in enumerate(zip(range(30, 69, 5), range(35, 71, 5))):
    if id1 == 65:
        id2 = 68  # clamp the last slice end
    dataset_path = os.path.join(base_dir, f"HAR_dataset_AL_v{index}")
    if not os.path.isdir(dataset_path):
        os.makedirs(dataset_path)
    participant_list = participant_folders[id1:id2]
    # The original 3-argument call omitted get_data's required dataset_root
    # (TypeError). "CompleteData" matches the root used for the other
    # participant datasets above — TODO confirm.
    get_data(participant_list, "Train", dataset_path, "CompleteData")
#concat every subtable for each participant
# NOTE(review): this cell cannot run as-is in this file:
#   - `participant_folder` (singular) is read on the next line before the loop
#     ever assigns it,
#   - `tqdm` is never imported here,
#   - `data_type` is never defined here.
# Presumably pasted from another notebook where those names existed — verify
# before reuse.
participant_folders=[os.path.join(participant_folder,folder) for folder in os.listdir(participant_folder)]
for participant_folder in tqdm(participant_folders):
    # Gather every CSV under <participant>/<data_type> and concatenate them.
    folder=os.path.join(participant_folder,data_type)
    table_arr=[]
    for table in os.listdir(folder):
        table_arr.append(pd.read_csv(os.path.join(folder,table)))
    df=pd.concat(table_arr)
    participant=os.path.basename(participant_folder)
    # Write the merged table back beside the sources.
    df.to_csv(os.path.join(folder,"{0}_raw_{1}.csv".format(data_type,participant)),index=False)
    print("Save table of participant : {}".format(participant))