Skip to content

Commit 1b78e5e

Browse files
authored
Add shuffling of files
1 parent 63778ed commit 1b78e5e

File tree

1 file changed

+240
-0
lines changed

1 file changed

+240
-0
lines changed

shuffle/shuffle.cpp

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
#include <algorithm> // for std::shuffle
#include <chrono>
#include <cstddef>   // for std::size_t
#include <cstdint>   // for the fixed-width integer types used by Position
#include <cstdio>    // for std::remove
#include <execution>
#include <fstream>
#include <iostream>
#include <numeric> // for std::iota
#include <random> // for std::default_random_engine
#include <sstream> // for std::ostringstream
#include <string>
#include <utility>   // for std::move
#include <vector>
10+
11+
// One fixed-size record of the binary data file.
//
// Records are read and written as raw bytes (sizeof(Position) per record), so
// the order and width of the fields below define the on-disk layout; changing
// them changes the file format.
struct Position
{
    uint64_t occupancy;  // NOTE(review): presumably an occupancy bitboard -- confirm with the data producer
    uint64_t pieces_lo;  // NOTE(review): presumably the low half of a packed piece encoding -- confirm
    uint64_t pieces_hi;  // NOTE(review): presumably the high half of the same encoding -- confirm
    int16_t score;
    uint8_t result;
    uint8_t stm_king;
    uint8_t nstm_king;
};
21+
22+
// Number of records held in memory per chunk: 32 * 16384 = 524288.
constexpr std::size_t CHUNK_SIZE{std::size_t{32} * 16384};
23+
24+
// Writes every record of `chunk` to `filename` as raw bytes, truncating any
// existing file of that name.
//
// Nothing is returned; a failure to open the file or to write/flush the data
// is reported on stderr.
void saveChunkToFile(const std::vector<Position>& chunk, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary | std::ios::out);
    if (!file.is_open()) {
        std::cerr << "Error opening file: " << filename << std::endl;
        return;
    }

    file.write(reinterpret_cast<const char*>(chunk.data()),
               static_cast<std::streamsize>(chunk.size() * sizeof(Position)));
    file.close();

    // close() flushes the buffer; a failed write or flush (e.g. disk full)
    // leaves the stream in a failed state, which would otherwise go unnoticed.
    if (!file) {
        std::cerr << "Error writing file: " << filename << std::endl;
    }
}
34+
35+
// Returns the name of the temporary file for chunk number `index`,
// i.e. "temp_chunk_<index>.bin" (relative to the current working directory).
std::string getTempFilename(std::size_t index) {
    return "temp_chunk_" + std::to_string(index) + ".bin";
}
38+
39+
// Removes `filename` from disk. A failure is reported on stderr and is
// otherwise ignored.
void deleteFile(const std::string& filename)
{
    if (std::remove(filename.c_str()) != 0)
    {
        std::cerr << "Error deleting file: " << filename << std::endl;
    }
}
46+
47+
void interleaveChunks(const std::vector<Position>& chunk1, const std::vector<Position>& chunk2,
48+
std::vector<Position>& result, std::default_random_engine& generator)
49+
{
50+
std::bernoulli_distribution distribution(0.5);
51+
52+
// Interleave the contents of chunk1 and chunk2 based on Bernoulli distribution
53+
std::size_t i = 0, j = 0;
54+
while (i < chunk1.size() && j < chunk2.size())
55+
{
56+
if (distribution(generator))
57+
{
58+
result.push_back(chunk1[i++]);
59+
}
60+
else
61+
{
62+
result.push_back(chunk2[j++]);
63+
}
64+
}
65+
66+
// Add the remaining elements from chunk1, if any
67+
while (i < chunk1.size())
68+
{
69+
result.push_back(chunk1[i++]);
70+
}
71+
72+
// Add the remaining elements from chunk2, if any
73+
while (j < chunk2.size())
74+
{
75+
result.push_back(chunk2[j++]);
76+
}
77+
}
78+
79+
// Function to process the binary file and shuffle chunks
80+
void shuffleChunks(const std::string& inputFilename, std::vector<std::string>& tempFiles, std::size_t& chunkIndex)
81+
{
82+
std::ifstream input(inputFilename, std::ios::binary | std::ios::in);
83+
if (!input.is_open())
84+
{
85+
std::cerr << "Error opening input file" << std::endl;
86+
return;
87+
}
88+
89+
std::vector<Position> chunk(CHUNK_SIZE);
90+
91+
while (input.read(reinterpret_cast<char*>(chunk.data()), CHUNK_SIZE * sizeof(Position)))
92+
{
93+
std::random_device rd;
94+
std::mt19937 g(rd());
95+
std::shuffle(chunk.begin(), chunk.end(), g);
96+
97+
std::string tempFilename = getTempFilename(chunkIndex);
98+
saveChunkToFile(chunk, tempFilename);
99+
tempFiles.push_back(tempFilename);
100+
101+
chunk.clear();
102+
chunk.resize(CHUNK_SIZE);
103+
104+
chunkIndex++;
105+
}
106+
107+
chunk.resize(input.gcount() / sizeof(Position));
108+
109+
if (!chunk.empty())
110+
{
111+
std::random_device rd;
112+
std::mt19937 g(rd());
113+
std::shuffle(chunk.begin(), chunk.end(), g);
114+
115+
std::string tempFilename = getTempFilename(chunkIndex);
116+
saveChunkToFile(chunk, tempFilename);
117+
tempFiles.push_back(tempFilename);
118+
chunkIndex++;
119+
}
120+
121+
input.close();
122+
}
123+
124+
// Function to interleave chunks in parallel
125+
void interleaveChunksInParallel(std::vector<std::string>& tempFiles, std::size_t& chunkIndex) {
126+
std::random_device rd;
127+
std::mt19937 g(rd());
128+
129+
// Interleave files until only one file is left
130+
while (tempFiles.size() > 1) {
131+
std::vector<std::string> newTempFiles;
132+
std::vector<std::size_t> indices(tempFiles.size());
133+
std::iota(indices.begin(), indices.end(), 0);
134+
std::shuffle(indices.begin(), indices.end(), g);
135+
136+
// If there's an odd number of files, move the last one to newTempFiles
137+
if (indices.size() % 2 == 1)
138+
{
139+
newTempFiles.push_back(tempFiles[indices.back()]);
140+
indices.pop_back();
141+
}
142+
143+
// Create pairs of indices for interleaving
144+
std::vector<std::pair<std::size_t, std::size_t>> pairs;
145+
for (std::size_t i = 0; i < indices.size(); i += 2)
146+
{
147+
pairs.emplace_back(indices[i], indices[i + 1]);
148+
}
149+
150+
// Interleave files in parallel
151+
std::for_each(std::execution::par, pairs.begin(), pairs.end(), [&](const std::pair<size_t, size_t>& pair)
152+
{
153+
// Get the index of the current pair
154+
std::size_t index = std::distance(pairs.begin(), std::find(pairs.begin(), pairs.end(), pair));
155+
auto [index1, index2] = pair;
156+
157+
std::vector<Position> chunk1, chunk2, interleavedChunk;
158+
159+
std::ifstream file1(tempFiles[index1], std::ios::binary | std::ios::in);
160+
std::ifstream file2(tempFiles[index2], std::ios::binary | std::ios::in);
161+
162+
if (!file1.is_open() || !file2.is_open())
163+
{
164+
std::cerr << "Error opening input files for interleaving" << std::endl;
165+
return;
166+
}
167+
168+
// Interleave files into new temporary file
169+
std::string newTempFilename = getTempFilename(chunkIndex + index);
170+
newTempFiles.push_back(newTempFilename);
171+
172+
std::ofstream mergeFile(newTempFilename, std::ios::binary | std::ios::out);
173+
if (!mergeFile.is_open())
174+
{
175+
std::cerr << "Error opening file: " << newTempFilename << std::endl;
176+
return;
177+
}
178+
179+
// Continue interleaving until both input files are read completely
180+
while (true)
181+
{
182+
chunk1.resize(CHUNK_SIZE);
183+
chunk2.resize(CHUNK_SIZE);
184+
185+
file1.read(reinterpret_cast<char*>(chunk1.data()), CHUNK_SIZE * sizeof(Position));
186+
file2.read(reinterpret_cast<char*>(chunk2.data()), CHUNK_SIZE * sizeof(Position));
187+
188+
chunk1.resize(file1.gcount() / sizeof(Position));
189+
chunk2.resize(file2.gcount() / sizeof(Position));
190+
191+
if (chunk1.empty() && chunk2.empty())
192+
{
193+
break;
194+
}
195+
196+
interleaveChunks(chunk1, chunk2, interleavedChunk, g);
197+
198+
mergeFile.write(reinterpret_cast<const char*>(interleavedChunk.data()), interleavedChunk.size() * sizeof(Position));
199+
interleavedChunk.clear();
200+
}
201+
202+
mergeFile.close();
203+
file1.close();
204+
file2.close();
205+
206+
// Delete the old temporary files
207+
deleteFile(tempFiles[index1]);
208+
deleteFile(tempFiles[index2]);
209+
});
210+
211+
chunkIndex += pairs.size();
212+
tempFiles = std::move(newTempFiles);
213+
}
214+
}
215+
216+
// Entry point: shuffles the records of the file named on the command line.
//
// The input is split into shuffled temporary chunk files, which are then
// merged pairwise until one file remains; its name and the elapsed time are
// printed. Returns 1 on a usage error or when no output file was produced.
int main(int argc, char* argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <input_filename>" << std::endl;
        return 1;
    }

    const std::string inputFilename(argv[1]);

    // steady_clock is monotonic, which is what an elapsed-time measurement needs.
    const auto start_time = std::chrono::steady_clock::now();

    std::vector<std::string> tempFiles;
    std::size_t chunkIndex = 0;

    shuffleChunks(inputFilename, tempFiles, chunkIndex);
    interleaveChunksInParallel(tempFiles, chunkIndex);

    // No file is left when the input could not be opened or held no complete
    // record; calling front() on the empty vector would be undefined behavior.
    if (tempFiles.empty()) {
        std::cerr << "No output file was produced" << std::endl;
        return 1;
    }

    std::cout << "Final result saved in: " << tempFiles.front() << std::endl;

    const auto end_time = std::chrono::steady_clock::now();
    const auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);

    std::cout << "Execution time: " << duration.count() << " milliseconds" << std::endl;

    return 0;
}

0 commit comments

Comments
 (0)