Skip to content

Commit 2e4100b

Browse files
authored
Update author information in Dataset_Gen.py
1 parent 1f5b982 commit 2e4100b

1 file changed

Lines changed: 187 additions & 0 deletions

File tree

Dataset_Gen.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
#!/usr/bin/python3
2+
# -*- coding:utf-8 -*-
3+
# @author: lxj
4+
# @description: A unified script to generate the NETD (Dynamic Non-I.I.D.
5+
# Encrypted Traffic Dataset) based on the ISCX-VPN dataset.
6+
# This script implements both Proportional Bias (for NETD-1, NETD-2)
7+
# and Compositional Bias (for NETD-3, NETD-4) strategies as
8+
# described in the paper "Respond to Change with Constancy...".
9+
10+
import os
11+
import random
12+
import shutil
13+
import math
14+
15+
def create_proportional_bias_dataset(base_path: str, output_path: str, dominant_ratio: float, *,
                                     total_train_dominant_samples: int = 400,
                                     total_test_samples: int = 100):
    """
    Constructs an O.O.D. dataset by creating a proportional bias between a randomly
    selected "dominant" application and the other "minor" applications within each service class.

    This method is used for generating NETD-1 and NETD-2.

    For every service-class folder under ``base_path`` one application folder is picked
    at random as the dominant one; the files of all remaining application folders form
    the minor pool. Files are copied (flattened, by basename) into
    ``<output_path>/train/<label>`` and ``<output_path>/test/<label>``.

    Args:
        base_path (str): The path to the source ISCX-VPN dataset, containing service class folders.
        output_path (str): The path where the generated dataset will be saved.
        dominant_ratio (float): Divisor applied to the dominant sample count to obtain the
            minor sample count: ``N_minor = floor(N_dominant / dominant_ratio)``. With the
            default of 400 dominant samples, ``3`` requests 133 minor samples (about 3:1
            dominant-to-minor) and ``1/3`` requests 1200 (1:3). Must be > 0.
            NOTE(review): earlier documentation described ``3`` as producing a 1:3
            dominant-to-minor split, which is the opposite of what this formula yields;
            confirm the intended direction against the paper.
        total_train_dominant_samples (int): Upper bound on dominant files drawn for the
            training set of each service class.
        total_test_samples (int): Upper bound on files drawn for the test set from each of
            the dominant and the minor pool (i.e. a 1:1 test split when enough files exist).

    Raises:
        ValueError: If ``dominant_ratio`` is not positive or a sample count is negative.
    """
    if dominant_ratio <= 0:
        raise ValueError(f"dominant_ratio must be > 0, got {dominant_ratio!r}")
    if total_train_dominant_samples < 0 or total_test_samples < 0:
        raise ValueError("sample counts must be non-negative")

    print(f"--- Creating Proportional Bias Dataset at {output_path} ---")

    # The small epsilon keeps a quotient such as 1199.9999999999998 (float noise for an
    # exact 1200) from being truncated one sample short.
    num_minor_samples = math.floor(total_train_dominant_samples / dominant_ratio + 1e-9)

    # Sorted listings make the random draws reproducible under random.seed(), because
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    service_labels = sorted(
        d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))
    )

    for label in service_labels:
        print(f"Processing service class: {label}...")
        label_path = os.path.join(base_path, label)
        applications = sorted(
            app for app in os.listdir(label_path) if os.path.isdir(os.path.join(label_path, app))
        )

        if not applications:
            continue

        dominant_app = random.choice(applications)
        minor_apps = [app for app in applications if app != dominant_app]

        # Aggregate all file paths for dominant and minor applications
        dominant_files = _list_files_recursive(os.path.join(label_path, dominant_app))
        minor_files = [
            path
            for app in minor_apps
            for path in _list_files_recursive(os.path.join(label_path, app))
        ]

        # Handle cases with only one application or insufficient minor files:
        # the dominant pool then doubles as the minor pool.
        if len(applications) == 1:
            minor_files = list(dominant_files)

        if not minor_files:
            print(f"Warning: No minor files for label {label}. Using dominant files as minor.")
            minor_files = list(dominant_files)

        # `taken` tracks every file already assigned to a split. Filtering each pool
        # against it is a no-op when the pools are disjoint, and prevents duplicates and
        # train/test leakage when the minor pool is a copy of the dominant pool.
        taken = set()

        # --- Sample for Training Set ---
        train_dominant_samples = random.sample(
            dominant_files, min(total_train_dominant_samples, len(dominant_files))
        )
        taken.update(train_dominant_samples)

        minor_pool = [path for path in minor_files if path not in taken]
        train_minor_samples = random.sample(minor_pool, min(num_minor_samples, len(minor_pool)))
        taken.update(train_minor_samples)

        # --- Sample for Test Set (ensuring no overlap with training set) ---
        # List comprehensions (instead of set differences) keep a stable order so the
        # draws stay reproducible for a fixed seed.
        remaining_dominant = [path for path in dominant_files if path not in taken]
        test_dominant_samples = random.sample(
            remaining_dominant, min(total_test_samples, len(remaining_dominant))
        )
        taken.update(test_dominant_samples)

        remaining_minor = [path for path in minor_files if path not in taken]
        test_minor_samples = random.sample(
            remaining_minor, min(total_test_samples, len(remaining_minor))
        )

        # --- Save the sampled files ---
        # NOTE(review): files are flattened into one folder per class, so two source
        # files sharing a basename would overwrite each other.
        splits = (
            ("train", train_dominant_samples + train_minor_samples),
            ("test", test_dominant_samples + test_minor_samples),
        )
        for split, samples in splits:
            if not samples:
                continue
            dest_dir = os.path.join(output_path, split, label)
            os.makedirs(dest_dir, exist_ok=True)
            for file_path in samples:
                shutil.copy(file_path, dest_dir)

    print("--- Proportional Bias Dataset created successfully. ---\n")


def _list_files_recursive(directory: str) -> list:
    """Return the sorted paths of all files found anywhere below ``directory``."""
    return sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
    )
81+
82+
83+
def create_compositional_bias_dataset(base_path: str, output_path: str, service_components: dict, training_ratio: float):
    """
    Constructs an O.O.D. dataset by creating a compositional bias, where the training
    set contains only a subset of applications for each service class, while the test set
    contains all of them.

    This method is used for generating NETD-3 and NETD-4.

    For each service, ``ceil(len(apps) * training_ratio)`` applications are chosen at
    random; every file below their folders is copied (flattened, by basename) into
    ``<output_path>/train/<service>``. Every file of every listed application is copied
    into ``<output_path>/test/<service>``, so the test split also contains the files of
    the training applications.

    Args:
        base_path (str): The path to the source ISCX-VPN dataset.
        output_path (str): The path where the generated dataset will be saved.
        service_components (dict): A mapping of service names to a list of their application folder names.
        training_ratio (float): The fraction of applications to include in the training set
            (e.g., 0.8 for 80%). Must lie within [0, 1].

    Raises:
        ValueError: If ``training_ratio`` is outside [0, 1].
    """
    if not 0 <= training_ratio <= 1:
        raise ValueError(f"training_ratio must be within [0, 1], got {training_ratio!r}")

    print(f"--- Creating Compositional Bias Dataset at {output_path} ---")

    for service, apps in service_components.items():
        print(f"Processing service class: {service}...")

        # Folder names in the mapping must exist on disk; report mismatches instead of
        # silently producing a smaller dataset.
        missing = [app for app in apps if not os.path.isdir(os.path.join(base_path, service, app))]
        if missing:
            print(f"Warning: Application folders not found for {service}, skipping: {missing}")

        # --- Determine Training Set Composition ---
        # Rounding first absorbs float noise: e.g. 25 * 0.28 evaluates to
        # 7.000000000000001, which a bare ceil() would turn into 8 applications.
        num_apps_for_training = math.ceil(round(len(apps) * training_ratio, 9))
        training_apps = random.sample(apps, num_apps_for_training)

        print(f" Training with {len(training_apps)}/{len(apps)} apps: {training_apps}")

        # --- Copy files: subset of applications for train, all applications for test ---
        for split, split_apps in (("train", training_apps), ("test", apps)):
            dest_dir = os.path.join(output_path, split, service)
            for app in split_apps:
                _copy_app_files(os.path.join(base_path, service, app), dest_dir)

    print("--- Compositional Bias Dataset created successfully. ---\n")


def _copy_app_files(app_path: str, dest_dir: str) -> int:
    """Copy every file below ``app_path`` flat into ``dest_dir``; return the number copied.

    ``dest_dir`` is created only when there is at least one file to copy. A missing
    ``app_path`` yields no files and is therefore skipped.
    """
    copied = 0
    for root, _, files in os.walk(app_path):
        for file in files:
            if copied == 0:
                os.makedirs(dest_dir, exist_ok=True)
            # NOTE(review): flattening by basename means equally named files from
            # different folders overwrite each other.
            shutil.copy(os.path.join(root, file), dest_dir)
            copied += 1
    return copied
129+
130+
131+
if __name__ == '__main__':
    # Script entry point: generates NETD-1 through NETD-4 below NETD_OUTPUT_PATH from
    # the ISCX-VPN dataset located at ISCX_BASE_PATH.

    # --- CONFIGURATION ---
    # !!! PLEASE UPDATE THESE PATHS !!!
    # Path to the extracted ISCX-VPN dataset, which should have subdirectories for each service
    # (e.g., .../iscx_vpn_dataset/Chat/, .../iscx_vpn_dataset/Email/)
    ISCX_BASE_PATH = "path/to/your/iscx_vpn_dataset"

    # Path where the generated NETD datasets will be stored. It must already exist.
    NETD_OUTPUT_PATH = "path/to/save/NETD"

    # Application mapping based on the paper and ISCX-VPN dataset structure.
    # Note: Folder names must match those in your ISCX_BASE_PATH.
    SERVICE_APP_MAPPING = {
        'Chat': ['AIMchat', 'facebookchat', 'hangoutschat', 'icqchat', 'skypechat'],
        'Email': ['gmail', 'imap', 'pop', 'smtp'],
        'File Transfer': ['ftps', 'sftp', 'skypefile'],
        'P2P': ['bittorrent'],
        'Streaming': ['vimeo', 'youtube'],
        'VoIP': ['facebookvoip', 'hangoutsvoip', 'skypevoip'],
    }

    if not os.path.isdir(ISCX_BASE_PATH) or not os.path.isdir(NETD_OUTPUT_PATH):
        # SystemExit with a message writes it to stderr and exits with status 1, so a
        # misconfigured run is not mistaken for a successful one by calling scripts.
        raise SystemExit("Error: Please update ISCX_BASE_PATH and NETD_OUTPUT_PATH to valid directories.")

    # --- GENERATE DATASETS ---

    # Generate NETD-1: Proportional bias, described as a 1:3 dominant-to-minor ratio.
    # NOTE(review): the function computes N_minor = N_dominant / dominant_ratio, so
    # dominant_ratio=3 requests about 400 dominant vs 133 minor training samples per
    # class (N_dom:N_min = 3:1), not 1:3. Confirm against the paper whether the
    # arguments of NETD-1 and NETD-2 should be swapped.
    create_proportional_bias_dataset(
        base_path=ISCX_BASE_PATH,
        output_path=os.path.join(NETD_OUTPUT_PATH, "NETD-1"),
        dominant_ratio=3,
    )

    # Generate NETD-2: Proportional bias, described as a 3:1 dominant-to-minor ratio.
    # NOTE(review): dominant_ratio=1/3 requests about 400 dominant vs 1200 minor
    # training samples per class (N_dom:N_min = 1:3); see the note on NETD-1.
    create_proportional_bias_dataset(
        base_path=ISCX_BASE_PATH,
        output_path=os.path.join(NETD_OUTPUT_PATH, "NETD-2"),
        dominant_ratio=(1/3),
    )

    # Generate NETD-3: Compositional bias with 80% of apps in the training set.
    create_compositional_bias_dataset(
        base_path=ISCX_BASE_PATH,
        output_path=os.path.join(NETD_OUTPUT_PATH, "NETD-3"),
        service_components=SERVICE_APP_MAPPING,
        training_ratio=0.8,
    )

    # Generate NETD-4: Compositional bias with 20% of apps in the training set.
    create_compositional_bias_dataset(
        base_path=ISCX_BASE_PATH,
        output_path=os.path.join(NETD_OUTPUT_PATH, "NETD-4"),
        service_components=SERVICE_APP_MAPPING,
        training_ratio=0.2,
    )

0 commit comments

Comments
 (0)