|
@@ -4,6 +4,7 @@
|
4
|
4
|
#include <time.h>
|
5
|
5
|
#include <unistd.h>
|
6
|
6
|
#include <string.h>
|
|
7
|
+#include <math.h>
|
7
|
8
|
#include <pthread.h>
|
8
|
9
|
|
9
|
10
|
struct matrix
|
|
@@ -177,13 +178,13 @@ s_matrix matrix_mult_parallel(s_matrix mat1, s_matrix mat2, unsigned thread_coun
|
177
|
178
|
mat = matrix_generate(mat1.m, mat2.n, 0);
|
178
|
179
|
unsigned scalars_count = mat1.m * mat2.n;
|
179
|
180
|
unsigned distribution[thread_count];
|
180
|
|
- unsigned scalar_start = 0;
|
181
|
|
- //unsigned scalar_start = distribution[0];
|
182
|
181
|
matrix_get_thread_scalars_distribution(scalars_count, thread_count, distribution);
|
|
182
|
+ //unsigned scalar_start = 0;
|
|
183
|
+ unsigned scalar_start = distribution[0];
|
183
|
184
|
pthread_t threads[thread_count];
|
184
|
|
- //threads[0] = 0;
|
|
185
|
+ threads[0] = 0;
|
185
|
186
|
|
186
|
|
- for (unsigned thread_number = 0; thread_number < thread_count; ++thread_number) {
|
|
187
|
+ for (unsigned thread_number = 1; thread_number < thread_count; ++thread_number) {
|
187
|
188
|
unsigned scalars_count = distribution[thread_number];
|
188
|
189
|
if (scalars_count > 0) {
|
189
|
190
|
threads[thread_number] = matrix_mult_parallel_launch_thread(mat1, mat2, mat, scalar_start, scalars_count, thread_number, 1);
|
|
@@ -194,7 +195,7 @@ s_matrix matrix_mult_parallel(s_matrix mat1, s_matrix mat2, unsigned thread_coun
|
194
|
195
|
}
|
195
|
196
|
}
|
196
|
197
|
|
197
|
|
- //matrix_mult_parallel_launch_thread(mat1, mat2, mat, 0, distribution[0], 0, 0);
|
|
198
|
+ matrix_mult_parallel_launch_thread(mat1, mat2, mat, 0, distribution[0], 0, 0);
|
198
|
199
|
|
199
|
200
|
for (unsigned thread_number = 0; thread_number < thread_count; ++thread_number) {
|
200
|
201
|
pthread_t thread = threads[thread_number];
|
|
@@ -237,7 +238,8 @@ void print_time(struct timespec* ts)
|
237
|
238
|
long us = (ts->tv_nsec / 1000) % 1000;
|
238
|
239
|
long ms = (ts->tv_nsec / 1000000) % 1000;
|
239
|
240
|
long s = (ts->tv_nsec / 1000000000) % 1000 + ts->tv_sec;
|
240
|
|
- printf("%3lds %3ldms %3ldus %3ldns", s, ms, us, ns);
|
|
241
|
+ long t = (s * 1000000000) + (ms * 1000000) + (us * 1000) + ns;
|
|
242
|
+ printf("%3lds %3ldms %3ldus %3ldns %12ld", s, ms, us, ns, t);
|
241
|
243
|
}
|
242
|
244
|
|
243
|
245
|
void test(unsigned size, unsigned thread_count)
|
|
@@ -256,6 +258,7 @@ void test(unsigned size, unsigned thread_count)
|
256
|
258
|
printf("%3dcpu %3dthreads %4d*%-4d ", get_cpu_count(), thread_count, size, size);
|
257
|
259
|
print_time(&time);
|
258
|
260
|
printf("\n");
|
|
261
|
+ fflush(stdout);
|
259
|
262
|
matrix_free(mat);
|
260
|
263
|
|
261
|
264
|
}
|
|
@@ -292,15 +295,18 @@ int main(void)
|
292
|
295
|
|
293
|
296
|
check();
|
294
|
297
|
|
295
|
|
- unsigned sizes[] = {10, 100};
|
296
|
|
- unsigned threads_count[] = {1, 4, 16, 64};
|
|
298
|
+ unsigned sizes[] = {10, 100, 1000, 2000, 5000};
|
|
299
|
+ unsigned threads_count = get_cpu_count();
|
|
300
|
+ if (threads_count < 64) {
|
|
301
|
+ threads_count = 64;
|
|
302
|
+ }
|
297
|
303
|
|
298
|
304
|
for (unsigned s = 0; s < sizeof(sizes) / sizeof(*sizes); ++s) {
|
299
|
305
|
unsigned size = sizes[s];
|
300
|
|
- test(size, 0);
|
301
|
|
- for (unsigned t = 0; t < sizeof(threads_count) / sizeof(*threads_count); ++t) {
|
302
|
|
- test(size, threads_count[t]);
|
|
306
|
+ for (unsigned t = threads_count; t > 1; t /= 2) {
|
|
307
|
+ test(size, t);
|
303
|
308
|
}
|
|
309
|
+ test(size, 0);
|
304
|
310
|
}
|
305
|
311
|
|
306
|
312
|
return 0;
|