|
@@ -0,0 +1,307 @@
|
|
1
|
+#define _POSIX_C_SOURCE 199309L
|
|
2
|
+#include <stdlib.h>
|
|
3
|
+#include <stdio.h>
|
|
4
|
+#include <time.h>
|
|
5
|
+#include <unistd.h>
|
|
6
|
+#include <string.h>
|
|
7
|
+#include <pthread.h>
|
|
8
|
+
|
|
9
|
/* Dense row-major matrix of ints: scalars[i][j] is row i, column j. */
struct matrix
{
    unsigned m;    /* number of rows */
    unsigned n;    /* number of columns */
    int** scalars; /* m heap-allocated row pointers, each to n ints */
};
typedef struct matrix s_matrix;
|
|
16
|
+
|
|
17
|
/* Work packet for one multiplication worker thread: compute `scalars_count`
 * result cells of `mat`, starting at linear (row-major) cell index
 * `scalar_start`.
 * NOTE(review): the matrices are copied by value but share the same int**
 * row storage, so workers must write disjoint cell ranges (they do, by the
 * distribution computed in matrix_get_thread_scalars_distribution). */
struct matrix_mult_thread_data
{
    s_matrix mat1;          /* left operand */
    s_matrix mat2;          /* right operand */
    s_matrix mat;           /* result matrix (row storage shared by workers) */
    unsigned scalar_start;  /* first linear cell index this worker computes */
    unsigned scalars_count; /* number of consecutive cells to compute */
    unsigned thread_number; /* worker index (informational) */
};
typedef struct matrix_mult_thread_data s_matrix_mult_thread_data;
|
|
27
|
+
|
|
28
|
/* Return the number of CPUs currently online, clamped to at least 1.
 * Fix: sysconf() may fail and return -1; the original cast that straight to
 * unsigned, producing a huge bogus CPU count. */
unsigned get_cpu_count(void)
{
    long count = sysconf(_SC_NPROCESSORS_ONLN);
    return count > 0 ? (unsigned)count : 1u;
}
|
|
32
|
+
|
|
33
|
+s_matrix matrix_generate(unsigned m, unsigned n, unsigned rmax)
|
|
34
|
+{
|
|
35
|
+ s_matrix mat;
|
|
36
|
+ mat.m = m;
|
|
37
|
+ mat.n = n;
|
|
38
|
+
|
|
39
|
+ mat.scalars = malloc(mat.m * sizeof(int*));
|
|
40
|
+ for (unsigned i = 0; i < mat.m; ++i) {
|
|
41
|
+ mat.scalars[i] = malloc(mat.n * sizeof(int));
|
|
42
|
+ for (unsigned j = 0; j < mat.n; ++j) {
|
|
43
|
+ mat.scalars[i][j] = (rmax == 0 ? 0 : (rand() % rmax));
|
|
44
|
+ }
|
|
45
|
+ }
|
|
46
|
+
|
|
47
|
+ return mat;
|
|
48
|
+}
|
|
49
|
+
|
|
50
|
+void matrix_free(s_matrix mat)
|
|
51
|
+{
|
|
52
|
+ for (unsigned i = 0; i < mat.m; ++i) {
|
|
53
|
+ free(mat.scalars[i]);
|
|
54
|
+ }
|
|
55
|
+ free(mat.scalars);
|
|
56
|
+}
|
|
57
|
+
|
|
58
|
+void matrix_print(s_matrix mat)
|
|
59
|
+{
|
|
60
|
+ for (unsigned i = 0; i < mat.m; ++i) {
|
|
61
|
+ printf("|");
|
|
62
|
+ for (unsigned j = 0; j < mat.n; ++j) {
|
|
63
|
+ printf("%5d|", mat.scalars[i][j]);
|
|
64
|
+ }
|
|
65
|
+ printf("\n");
|
|
66
|
+ }
|
|
67
|
+}
|
|
68
|
+
|
|
69
|
+int matrix_equals(s_matrix mat1, s_matrix mat2)
|
|
70
|
+{
|
|
71
|
+ if (mat1.n != mat2.n || mat1.m != mat2.n) {
|
|
72
|
+ return 0;
|
|
73
|
+ }
|
|
74
|
+ for (unsigned i = 0; i < mat1.n; ++i) {
|
|
75
|
+ for (unsigned j = 0; j < mat1.m; ++j) {
|
|
76
|
+ if (mat1.scalars[i][j] != mat2.scalars[i][j]) {
|
|
77
|
+ return 0;
|
|
78
|
+ }
|
|
79
|
+ }
|
|
80
|
+ }
|
|
81
|
+ return 1;
|
|
82
|
+}
|
|
83
|
+
|
|
84
|
+unsigned matrix_mult_scalar(s_matrix mat1, s_matrix mat2, unsigned i, unsigned j)
|
|
85
|
+{
|
|
86
|
+ unsigned a = 0;
|
|
87
|
+ for (unsigned k = 0; k < mat1.n; ++k) {
|
|
88
|
+ a += mat1.scalars[i][k] * mat2.scalars[k][j];
|
|
89
|
+ }
|
|
90
|
+ return a;
|
|
91
|
+}
|
|
92
|
+
|
|
93
|
+s_matrix matrix_mult_sequential(s_matrix mat1, s_matrix mat2)
|
|
94
|
+{
|
|
95
|
+ s_matrix mat;
|
|
96
|
+ if (mat1.n != mat2.m) {
|
|
97
|
+ mat.n = 0;
|
|
98
|
+ mat.m = 0;
|
|
99
|
+ mat.scalars = 0;
|
|
100
|
+ }
|
|
101
|
+ else {
|
|
102
|
+ mat = matrix_generate(mat1.m, mat2.n, 0);
|
|
103
|
+ for (unsigned i = 0; i < mat.m; ++i) {
|
|
104
|
+ for (unsigned j = 0; j < mat.n; ++j) {
|
|
105
|
+ mat.scalars[i][j] = matrix_mult_scalar(mat1, mat2, i, j);
|
|
106
|
+ }
|
|
107
|
+ }
|
|
108
|
+ }
|
|
109
|
+ return mat;
|
|
110
|
+}
|
|
111
|
+
|
|
112
|
/* Split `scalars_count` result cells across `thread_count` workers.
 * distribution[t] receives the cell count for worker t: every worker but
 * the last active one gets floor(count / threads) cells (at least 1), the
 * last active worker absorbs the remainder, and any workers left without
 * work get an explicit 0. */
void matrix_get_thread_scalars_distribution(unsigned scalars_count, unsigned thread_count, unsigned* distribution)
{
    unsigned chunk = scalars_count / thread_count;
    if (chunk == 0) {
        chunk = 1;
    }

    unsigned remaining = scalars_count;
    unsigned t = 0;
    while (t < thread_count && remaining > 0) {
        unsigned share = (t == thread_count - 1) ? remaining : chunk;
        distribution[t] = share;
        remaining -= share;
        ++t;
    }
    while (t < thread_count) {
        distribution[t] = 0;
        ++t;
    }
}
|
|
128
|
+
|
|
129
|
+void* matrix_mult_parallel_thread(void* arg)
|
|
130
|
+{
|
|
131
|
+ s_matrix_mult_thread_data* data = (s_matrix_mult_thread_data*)arg;
|
|
132
|
+
|
|
133
|
+ unsigned j = data->scalar_start % data->mat.m;
|
|
134
|
+
|
|
135
|
+ for (unsigned i = data->scalar_start / data->mat.m; i < data->mat.m && data->scalars_count > 0; ++i) {
|
|
136
|
+ for (; j < data->mat.n && data->scalars_count > 0; ++j) {
|
|
137
|
+ data->mat.scalars[i][j] = matrix_mult_scalar(data->mat1, data->mat2, i, j);
|
|
138
|
+ --data->scalars_count;
|
|
139
|
+ }
|
|
140
|
+ j = 0;
|
|
141
|
+ }
|
|
142
|
+
|
|
143
|
+ free(data);
|
|
144
|
+ return 0;
|
|
145
|
+}
|
|
146
|
+
|
|
147
|
+pthread_t matrix_mult_parallel_launch_thread(s_matrix mat1, s_matrix mat2, s_matrix mat, unsigned scalar_start,
|
|
148
|
+ unsigned scalars_count, unsigned thread_number, int launch)
|
|
149
|
+{
|
|
150
|
+ s_matrix_mult_thread_data* data = (s_matrix_mult_thread_data*)malloc(sizeof(s_matrix_mult_thread_data));
|
|
151
|
+ data->mat1 = mat1;
|
|
152
|
+ data->mat2 = mat2;
|
|
153
|
+ data->mat = mat;
|
|
154
|
+ data->scalar_start = scalar_start;
|
|
155
|
+ data->scalars_count = scalars_count;
|
|
156
|
+ data->thread_number = thread_number;
|
|
157
|
+ pthread_t thread = 0;
|
|
158
|
+ (void)launch;
|
|
159
|
+ if (launch) {
|
|
160
|
+ pthread_create(&thread, 0, matrix_mult_parallel_thread, data);
|
|
161
|
+ }
|
|
162
|
+ else {
|
|
163
|
+ matrix_mult_parallel_thread(data);
|
|
164
|
+ }
|
|
165
|
+ return thread;
|
|
166
|
+}
|
|
167
|
+
|
|
168
|
+s_matrix matrix_mult_parallel(s_matrix mat1, s_matrix mat2, unsigned thread_count)
|
|
169
|
+{
|
|
170
|
+ s_matrix mat;
|
|
171
|
+ if (mat1.n != mat2.m) {
|
|
172
|
+ mat.n = 0;
|
|
173
|
+ mat.m = 0;
|
|
174
|
+ mat.scalars = 0;
|
|
175
|
+ }
|
|
176
|
+ else {
|
|
177
|
+ mat = matrix_generate(mat1.m, mat2.n, 0);
|
|
178
|
+ unsigned scalars_count = mat1.m * mat2.n;
|
|
179
|
+ unsigned distribution[thread_count];
|
|
180
|
+ unsigned scalar_start = 0;
|
|
181
|
+ //unsigned scalar_start = distribution[0];
|
|
182
|
+ matrix_get_thread_scalars_distribution(scalars_count, thread_count, distribution);
|
|
183
|
+ pthread_t threads[thread_count];
|
|
184
|
+ //threads[0] = 0;
|
|
185
|
+
|
|
186
|
+ for (unsigned thread_number = 0; thread_number < thread_count; ++thread_number) {
|
|
187
|
+ unsigned scalars_count = distribution[thread_number];
|
|
188
|
+ if (scalars_count > 0) {
|
|
189
|
+ threads[thread_number] = matrix_mult_parallel_launch_thread(mat1, mat2, mat, scalar_start, scalars_count, thread_number, 1);
|
|
190
|
+ scalar_start += scalars_count;
|
|
191
|
+ }
|
|
192
|
+ else {
|
|
193
|
+ threads[thread_number] = 0;
|
|
194
|
+ }
|
|
195
|
+ }
|
|
196
|
+
|
|
197
|
+ //matrix_mult_parallel_launch_thread(mat1, mat2, mat, 0, distribution[0], 0, 0);
|
|
198
|
+
|
|
199
|
+ for (unsigned thread_number = 0; thread_number < thread_count; ++thread_number) {
|
|
200
|
+ pthread_t thread = threads[thread_number];
|
|
201
|
+ if (thread != 0) {
|
|
202
|
+ pthread_join(thread, 0);
|
|
203
|
+ }
|
|
204
|
+ }
|
|
205
|
+ }
|
|
206
|
+ return mat;
|
|
207
|
+}
|
|
208
|
+
|
|
209
|
+struct timespec get_time()
|
|
210
|
+{
|
|
211
|
+ struct timespec start_time;
|
|
212
|
+ clock_gettime(CLOCK_MONOTONIC, &start_time);
|
|
213
|
+ return start_time;
|
|
214
|
+}
|
|
215
|
+
|
|
216
|
/* Return *ts1 - *ts2 as a normalized timespec (0 <= tv_nsec < 1e9),
 * assuming *ts1 >= *ts2.
 * Fix: the original accumulated the result in a `static` variable —
 * needless shared mutable state that would be a data race if this were
 * ever called concurrently.  The struct is returned by value, so a plain
 * local works identically. */
struct timespec time_diff(struct timespec* ts1, struct timespec* ts2)
{
    struct timespec ts;
    ts.tv_sec = ts1->tv_sec - ts2->tv_sec;
    ts.tv_nsec = ts1->tv_nsec - ts2->tv_nsec;
    if (ts.tv_nsec < 0) {
        /* borrow one second into the nanosecond field */
        ts.tv_sec--;
        ts.tv_nsec += 1000000000;
    }
    return ts;
}
|
|
227
|
+
|
|
228
|
+struct timespec get_duration(struct timespec* ts)
|
|
229
|
+{
|
|
230
|
+ struct timespec time = get_time();
|
|
231
|
+ return time_diff(&time, ts);
|
|
232
|
+}
|
|
233
|
+
|
|
234
|
/* Print *ts to stdout as "  Ns  Nms  Nus  Nns" (no trailing newline),
 * decomposing tv_nsec into milli/micro/nano components. */
void print_time(struct timespec* ts)
{
    long nanos = ts->tv_nsec % 1000;
    long micros = (ts->tv_nsec / 1000) % 1000;
    long millis = (ts->tv_nsec / 1000000) % 1000;
    long secs = (ts->tv_nsec / 1000000000) % 1000 + ts->tv_sec;
    printf("%3lds %3ldms %3ldus %3ldns", secs, millis, micros, nanos);
}
|
|
242
|
+
|
|
243
|
+void test(unsigned size, unsigned thread_count)
|
|
244
|
+{
|
|
245
|
+ s_matrix mat1 = matrix_generate(size, size, 100);
|
|
246
|
+
|
|
247
|
+ s_matrix mat;
|
|
248
|
+ struct timespec start = get_time();
|
|
249
|
+ if (thread_count == 0) {
|
|
250
|
+ mat = matrix_mult_sequential(mat1, mat1);
|
|
251
|
+ }
|
|
252
|
+ else {
|
|
253
|
+ mat = matrix_mult_parallel(mat1, mat1, thread_count);
|
|
254
|
+ }
|
|
255
|
+ struct timespec time = get_duration(&start);
|
|
256
|
+ printf("%3dcpu %3dthreads %4d*%-4d ", get_cpu_count(), thread_count, size, size);
|
|
257
|
+ print_time(&time);
|
|
258
|
+ printf("\n");
|
|
259
|
+ matrix_free(mat);
|
|
260
|
+
|
|
261
|
+}
|
|
262
|
+
|
|
263
|
+void check()
|
|
264
|
+{
|
|
265
|
+ s_matrix mat = matrix_generate(3, 3, 0);
|
|
266
|
+ mat.scalars[0][0] = 25;
|
|
267
|
+ mat.scalars[0][1] = 26;
|
|
268
|
+ mat.scalars[0][2] = 90;
|
|
269
|
+ mat.scalars[1][0] = 14;
|
|
270
|
+ mat.scalars[1][1] = 36;
|
|
271
|
+ mat.scalars[1][2] = 1;
|
|
272
|
+ mat.scalars[2][0] = 3;
|
|
273
|
+ mat.scalars[2][1] = 9;
|
|
274
|
+ mat.scalars[2][2] = 6;
|
|
275
|
+
|
|
276
|
+ s_matrix mat1 = matrix_mult_sequential(mat, mat);
|
|
277
|
+ s_matrix mat2 = matrix_mult_parallel(mat, mat, 1);
|
|
278
|
+ s_matrix mat3 = matrix_mult_parallel(mat, mat, get_cpu_count());
|
|
279
|
+ if (!matrix_equals(mat1, mat2) || !matrix_equals(mat1, mat3)) {
|
|
280
|
+ matrix_print(mat1);
|
|
281
|
+ printf("\n");
|
|
282
|
+ matrix_print(mat2);
|
|
283
|
+ printf("\n");
|
|
284
|
+ matrix_print(mat3);
|
|
285
|
+ exit(1);
|
|
286
|
+ }
|
|
287
|
+}
|
|
288
|
+
|
|
289
|
/* Entry point: seed the PRNG, verify the parallel path against the
 * sequential one on a known matrix, then benchmark each size sequentially
 * and with several thread counts. */
int main(void)
{
    srand(time(0));

    check();

    unsigned sizes[] = {10, 100};
    unsigned threads_count[] = {1, 4, 16, 64};
    size_t size_count = sizeof(sizes) / sizeof(*sizes);
    size_t variant_count = sizeof(threads_count) / sizeof(*threads_count);

    for (size_t s = 0; s < size_count; ++s) {
        unsigned size = sizes[s];
        test(size, 0); /* sequential baseline first */
        for (size_t t = 0; t < variant_count; ++t) {
            test(size, threads_count[t]);
        }
    }

    return 0;
}
|