#!/usr/bin/env python
# coding: utf-8

# Notebook export: collects the per-participant video metadata files into a
# single DataFrame (`df`) with the columns video_path, label and frames.

# In[ ]:

import pandas as pd
import os
import numpy as np
import shutil
import json
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from decord import VideoReader


# In[72]:

def get_file_stem(path):
    """Return the last component of *path* with its final extension removed."""
    base = os.path.basename(path)
    return os.path.splitext(base)[0]


# In[ ]:

root_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Participants"
participants = os.listdir(root_path)

# In[ ]:

# Participants that are excluded from this run (see final_participants below).
processed_participants = [
    '1205a', '0205b', '0605b', '2504g', '020419c', '0705b', '2504d',
    '020419f', '0905a', '2204c', '0205e', '010419c', '1205b', '2504e',
    '0404b', '1105a', '0905b', '2304b', '1105d', '2604a', '1105c', '1005c',
    '1005b', '1105e', '2504c', '020419e',
]

# In[49]:

final_participants = [
    participant
    for participant in participants
    if participant not in processed_participants
]

# In[20]:

new_root = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dartaset_Pictures"

# In[50]:

# get participants with videos
video_folder = "ArtworkClips"
dfs = {"video_path": [], "label": [], "frames": []}
for participant in tqdm(final_participants):
    cd = os.path.join(root_path, participant)
    # isdir() instead of a listdir() membership test: a stray file inside
    # root_path is skipped rather than crashing the whole run.
    if not os.path.isdir(os.path.join(cd, video_folder)):
        continue

    # Disabled step: copy the participant's clips under new_root.
    # dest_dir = os.path.join(new_root,participant,video_folder)
    # if(not(os.path.isdir(dest_dir))):
    #     os.makedirs(dest_dir)
    # #copy folder to new_dataset_folder
    # dir_to_copy = os.path.join(cd,video_folder)
    # for file in os.listdir(dir_to_copy):
    #     dest = os.path.join(dest_dir,file)
    #     file_dir =os.path.join(dir_to_copy,file)
    #     shutil.copyfile(file_dir,dest)

    df_path = os.path.join(cd, "FullDataset", f"video_dataset_{participant}.txt")
    try:
        # The metadata file is JSON despite its .txt extension.
        with open(df_path) as metadata_file:
            metadata = json.load(metadata_file)
        # Resolve every column before touching `dfs`, so a record that lacks
        # one key cannot leave the three lists with different lengths.
        columns = {key: metadata[key] for key in dfs}
    except (OSError, ValueError, KeyError, TypeError) as err:
        # Best effort: report the participant and carry on with the others.
        print(f"{participant} not processed ({err!r})")
        continue

    for key, values in columns.items():
        dfs[key].append(values)

for key in dfs:
    # np.concatenate raises on an empty list, which happens when no
    # participant could be loaded; fall back to an empty column instead.
    if dfs[key]:
        dfs[key] = np.concatenate(dfs[key])
    else:
        dfs[key] = np.array([], dtype=object)

# In[51]:

df = pd.DataFrame.from_dict(dfs)

# In[60]:
# The participant id is taken as the first "/"-separated segment of video_path.
# NOTE(review): assumes the metadata stores paths with forward slashes - confirm.
df["participant"] = df["video_path"].apply(lambda row: row.split("/")[0])

# In[61]:

df

# In[41]:

import joblib

encoder_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dataset_meta\encoder.pkl"
labels = df["label"].values
encoder = LabelEncoder().fit(labels)
labels_enc = encoder.transform(labels)
# Persist the fitted encoder so the integer labels can be mapped back later.
joblib.dump(encoder, encoder_path)

# In[53]:

# labels_enc already holds the encoding of df["label"]; no second transform needed.
df["label"] = labels_enc

# In[59]:

# Number of participants grouped into each exported dataset version.
PARTICIPANTS_PER_SUBSET = 8
# (start, stop) slice bounds into final_participants; the last stop may exceed
# the list length, which slicing tolerates.
subsets_ids = [
    (id1, id1 + PARTICIPANTS_PER_SUBSET)
    for id1 in range(0, len(final_participants), PARTICIPANTS_PER_SUBSET)
]


# In[73]:

def get_subset_df(df, participant_subset):
    """Return the rows of *df* that belong to the given participants.

    Args:
        df: frame with at least the columns "participant", "video_path",
            "label" and "frames".
        participant_subset: iterable of participant ids to keep.

    Returns:
        A new frame with the columns "video_path", "label" and "frames",
        where "video_path" is reduced to the file stem via get_file_stem.
        The input frame is left unmodified.
    """
    selected = df["participant"].isin(list(participant_subset))
    # Copy so that rewriting video_path below never writes into the caller's frame.
    subset = df.loc[selected, ["video_path", "label", "frames"]].copy()
    subset["video_path"] = subset["video_path"].apply(get_file_stem)
    return subset


def df_to_txt(df, dir_path):
    """Append *df* to the file *dir_path* as space-separated rows.

    No header and no index are written. The file is opened in append mode,
    so calling this twice with the same path duplicates the rows.
    """
    df.to_csv(dir_path, header=False, index=False, sep=' ', mode='a')


# In[74]:

base_dir = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Frames_Dataset_AL"
for idx, (id1, id2) in enumerate(subsets_ids):
    subset_path = os.path.join(base_dir, f"Dataset_v{idx}")
    os.makedirs(subset_path, exist_ok=True)
    dataset_file = os.path.join(subset_path, "Train.txt")
    df_subset = get_subset_df(df, final_participants[id1:id2])
    df_to_txt(df_subset, dataset_file)

# In[ ]:

# NOTE(review): this cell displayed `splits[split]`, but neither name is
# assigned until the next cell, so running it top to bottom raised NameError.
# splits[split]

# In[ ]:

# NOTE(review): train_index, val_index and test_index are not assigned anywhere
# in the visible source; this cell cannot run until they are provided.
splits = {"train": train_index, "val": val_index, "test": test_index}
root_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dataset_meta"
for split, split_index in splits.items():
    df_split = df.loc[split_index]
    dir_path = os.path.join(root_path, f"{split}.txt")
    df_to_txt(df_split, dir_path)

# In[ ]:

df.loc[splits["train"]]

# In[ ]:

test_video = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dartaset_Pictures\0205e\ArtworkClips\0205e_3.mp4"

# In[ ]:

vr = VideoReader(test_video)

# In[ ]:

n_frames = len(vr)
base_test = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dartaset_Pictures\0205e\test"

# In[ ]:

from PIL import Image

# In[ ]:

# read df
df_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dataset_meta\test.txt"
df = pd.read_csv(df_path, sep=" ", header=None)

# In[ ]:

df.columns = ["video_path", "label", "frames"]


# In[ ]:

def get_file_stem(path):
    """Return the last component of *path* with its final extension removed."""
    base = os.path.basename(path)
    return os.path.splitext(base)[0]


root = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Participants"