import pandas as pd
import os
import numpy as np
import shutil
import json
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit
from decord import VideoReader
def get_file_stem(path):
    """Return the file name of *path* with directory and final extension removed."""
    return os.path.splitext(os.path.basename(path))[0]
# --- Configuration: locate participant folders and select the unprocessed ones ---
root_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Participants"
participants = os.listdir(root_path)

# Participants already handled by a previous run; excluded below.
processed_participants = [
    '1205a', '0205b', '0605b', '2504g', '020419c', '0705b', '2504d',
    '020419f', '0905a', '2204c', '0205e', '010419c', '1205b', '2504e',
    '0404b', '1105a', '0905b', '2304b', '1105d', '2604a', '1105c',
    '1005c', '1005b', '1105e', '2504c', '020419e',
]

# Set gives O(1) membership tests (the original tested against the list with
# the non-idiomatic `not(p in ...)`); order of final_participants follows
# os.listdir, exactly as before.
_processed = set(processed_participants)
final_participants = [p for p in participants if p not in _processed]

new_root = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dartaset_Pictures"
#get participants with videos
video_folder = "ArtworkClips"
# Column accumulators for the per-clip metadata gathered in the loop below.
dfs = {"video_path": [], "label": [], "frames": []}
# Collect each participant's per-clip metadata (video_path, label, frames)
# from FullDataset/video_dataset_<id>.txt — a JSON file despite the extension.
for participant in tqdm(final_participants):
    cd = os.path.join(root_path, participant)
    if video_folder in os.listdir(cd):
        df_path = os.path.join(cd, "FullDataset", f"video_dataset_{participant}.txt")
        try:
            # Context manager closes the handle; the original
            # json.load(open(...)) leaked the file object.
            with open(df_path) as fh:
                df = json.load(fh)
            for key in dfs.keys():
                dfs[key].append(df[key])
        except (OSError, json.JSONDecodeError, KeyError):
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid genuine bugs.
            print(f"{participant} not processed")

# Flatten the per-participant lists into one flat array per column.
for key in dfs.keys():
    dfs[key] = np.concatenate(dfs[key])
# (notebook output) 100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 42.16it/s]
# Assemble the flat column dict into one DataFrame.
df = pd.DataFrame.from_dict(dfs)
# The participant id is the first component of the relative clip path.
df["participant"] = df["video_path"].map(lambda path: path.split("/")[0])
df
# (notebook output: DataFrame preview)
# | video_path | label | frames | participant | |
# |---|---|---|---|---|
# | 0 | 010419e/ArtworkClips/010419e_10.mp4 | 15 | 280 | 010419e |
# | 1 | 010419e/ArtworkClips/010419e_11.mp4 | 4 | 341 | 010419e |
# | 2 | 010419e/ArtworkClips/010419e_12.mp4 | 4 | 347 | 010419e |
# | 3 | 010419e/ArtworkClips/010419e_14.mp4 | 5 | 295 | 010419e |
# | 4 | 010419e/ArtworkClips/010419e_17.mp4 | 0 | 354 | 010419e |
# | ... | ... | ... | ... | ... |
# | 2030 | 3004e/ArtworkClips/3004e_93.mp4 | 16 | 292 | 3004e |
# | 2031 | 3004e/ArtworkClips/3004e_94.mp4 | 16 | 278 | 3004e |
# | 2032 | 3004e/ArtworkClips/3004e_95.mp4 | 16 | 287 | 3004e |
# | 2033 | 3004e/ArtworkClips/3004e_96.mp4 | 0 | 295 | 3004e |
# | 2034 | 3004e/ArtworkClips/3004e_98.mp4 | 16 | 289 | 3004e |
# 2035 rows × 4 columns
import joblib

# Fit a LabelEncoder on the raw labels and persist it so the same
# label -> integer mapping can be reused at inference time.
encoder_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dataset_meta\encoder.pkl"
labels = df["label"].values
encoder = LabelEncoder()
encoder.fit(labels)
labels_enc = encoder.transform(labels)
joblib.dump(encoder, encoder_path)
# (notebook output) ['C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dataset_meta\\encoder.pkl']
# Replace the raw label column with its integer encoding.
# NOTE(review): this recomputes the transform of the same column that was
# already encoded a few lines above — the earlier labels_enc value is unused.
labels_enc = encoder.transform(df["label"])
df["label"] = labels_enc
subsets_ids = [(id1,id1+8) for id1 in range(0,len(final_participants),8)]
def get_subset_df(df, participant_subset):
    """Return the (video_path, label, frames) rows for the given participants.

    video_path is reduced to the file stem (basename without extension).

    Unlike the original version, this no longer mutates the caller's frame:
    the original rewrote df["video_path"] in place on every call — a hidden
    side effect on the module-level DataFrame. It also replaces the manual
    index-accumulation loop with a vectorised isin() mask.
    """
    wanted = set(participant_subset)  # O(1) membership tests
    subset = df.loc[df["participant"].isin(wanted),
                    ["video_path", "label", "frames"]].copy()
    # Reduce each relative path to its file stem (same logic as get_file_stem).
    subset["video_path"] = subset["video_path"].map(
        lambda p: os.path.splitext(os.path.basename(p))[0]
    )
    return subset
def df_to_txt(df, dir_path):
    """Append *df* to *dir_path* as space-separated values, no header/index.

    NOTE(review): mode='a' appends, so re-running the pipeline duplicates
    rows in an existing file — confirm that is intended before rerunning.
    """
    # header/index are documented as booleans in the pandas API; the
    # original's None only worked because it is falsy.
    df.to_csv(dir_path, header=False, index=False, sep=' ', mode='a')
# Write one Train.txt per 8-participant subset (active-learning rounds).
base_dir = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Frames_Dataset_AL"
for idx, (id1, id2) in enumerate(subsets_ids):
    subset_path = os.path.join(base_dir, f"Dataset_v{idx}")
    # exist_ok avoids the racy isdir()-then-makedirs() of the original.
    os.makedirs(subset_path, exist_ok=True)
    dataset_file = os.path.join(subset_path, "Train.txt")
    df_subset = get_subset_df(df, final_participants[id1:id2])
    df_to_txt(df_subset, dataset_file)
# NOTE(review): train_index/val_index/test_index come from a cell not shown
# here (presumably a stratified split over df) — confirm they are defined
# before this point. A stray bare expression `splits[split]` that preceded
# this dict (it raised NameError: splits was not yet defined) was removed.
splits = {"train": train_index, "val": val_index, "test": test_index}
root_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dataset_meta"
# Persist each split as a space-separated txt next to the encoder metadata.
for split in splits.keys():
    df_split = df.loc[splits[split]]
    dir_path = os.path.join(root_path, f"{split}.txt")
    df_to_txt(df_split, dir_path)
df.loc[splits["train"]]
# Smoke test: open one known clip with decord and count its frames.
test_video = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dartaset_Pictures\0205e\ArtworkClips\0205e_3.mp4"
vr = VideoReader(test_video)
# Number of decodable frames in the clip.
n_frames = len(vr)
# Destination folder for test frame images — presumably written with PIL in a
# later cell not visible here; verify against that code.
base_test = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dartaset_Pictures\0205e\test"
from PIL import Image
# Reload the previously written test split from disk.
df_path = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Datasets\Video_Dataset_meta\test.txt"
# names= sets the column labels at parse time instead of assigning
# df.columns afterwards; the resulting frame is identical.
df = pd.read_csv(df_path, sep=" ", header=None,
                 names=["video_path", "label", "frames"])
def get_file_stem(path):
    """Return the basename of *path* without its final extension."""
    filename = os.path.basename(path)
    stem, _ext = os.path.splitext(filename)
    return stem

root = r"C:\Users\jeuux\Desktop\Carrera\MoAI\TFM\AnnotatedData\FinalDatasets\Participants"