LLVM OpenMP* Runtime Library
kmp_affinity.cpp
1/*
2 * kmp_affinity.cpp -- affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_i18n.h"
16#include "kmp_io.h"
17#include "kmp_str.h"
18#include "kmp_wrapper_getpid.h"
19#if KMP_USE_HIER_SCHED
20#include "kmp_dispatch_hier.h"
21#endif
22#if KMP_USE_HWLOC
23// Copied from hwloc
24#define HWLOC_GROUP_KIND_INTEL_MODULE 102
25#define HWLOC_GROUP_KIND_INTEL_TILE 103
26#define HWLOC_GROUP_KIND_INTEL_DIE 104
27#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28#endif
29
30// The machine topology
31kmp_topology_t *__kmp_topology = nullptr;
32// KMP_HW_SUBSET environment variable
33kmp_hw_subset_t *__kmp_hw_subset = nullptr;
34
35// Store the real or imagined machine hierarchy here
36static hierarchy_info machine_hierarchy;
37
38void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
39
40void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
41 kmp_uint32 depth;
42 // The test below is true if affinity is available, but set to "none". Need to
43 // init on first use of hierarchical barrier.
44 if (TCR_1(machine_hierarchy.uninitialized))
45 machine_hierarchy.init(nproc);
46
47 // Adjust the hierarchy in case num threads exceeds original
48 if (nproc > machine_hierarchy.base_num_threads)
49 machine_hierarchy.resize(nproc);
50
51 depth = machine_hierarchy.depth;
52 KMP_DEBUG_ASSERT(depth > 0);
53
54 thr_bar->depth = depth;
55 __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
56 &(thr_bar->base_leaf_kids));
57 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
58}
59
60static int nCoresPerPkg, nPackages;
61static int __kmp_nThreadsPerCore;
62#ifndef KMP_DFLT_NTH_CORES
63static int __kmp_ncores;
64#endif
65
66const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
67 switch (type) {
68 case KMP_HW_SOCKET:
69 return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
70 case KMP_HW_DIE:
71 return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
72 case KMP_HW_MODULE:
73 return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
74 case KMP_HW_TILE:
75 return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
76 case KMP_HW_NUMA:
77 return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
78 case KMP_HW_L3:
79 return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
80 case KMP_HW_L2:
81 return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
82 case KMP_HW_L1:
83 return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
84 case KMP_HW_LLC:
85 return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
86 case KMP_HW_CORE:
87 return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
88 case KMP_HW_THREAD:
89 return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
90 case KMP_HW_PROC_GROUP:
91 return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
92 }
93 return KMP_I18N_STR(Unknown);
94}
95
96const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
97 switch (type) {
98 case KMP_HW_SOCKET:
99 return ((plural) ? "sockets" : "socket");
100 case KMP_HW_DIE:
101 return ((plural) ? "dice" : "die");
102 case KMP_HW_MODULE:
103 return ((plural) ? "modules" : "module");
104 case KMP_HW_TILE:
105 return ((plural) ? "tiles" : "tile");
106 case KMP_HW_NUMA:
107 return ((plural) ? "numa_domains" : "numa_domain");
108 case KMP_HW_L3:
109 return ((plural) ? "l3_caches" : "l3_cache");
110 case KMP_HW_L2:
111 return ((plural) ? "l2_caches" : "l2_cache");
112 case KMP_HW_L1:
113 return ((plural) ? "l1_caches" : "l1_cache");
114 case KMP_HW_LLC:
115 return ((plural) ? "ll_caches" : "ll_cache");
116 case KMP_HW_CORE:
117 return ((plural) ? "cores" : "core");
118 case KMP_HW_THREAD:
119 return ((plural) ? "threads" : "thread");
120 case KMP_HW_PROC_GROUP:
121 return ((plural) ? "proc_groups" : "proc_group");
122 }
123 return ((plural) ? "unknowns" : "unknown");
124}
125
127// kmp_hw_thread_t methods
128int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
129 const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
130 const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
131 int depth = __kmp_topology->get_depth();
132 for (int level = 0; level < depth; ++level) {
133 if (ahwthread->ids[level] < bhwthread->ids[level])
134 return -1;
135 else if (ahwthread->ids[level] > bhwthread->ids[level])
136 return 1;
137 }
138 if (ahwthread->os_id < bhwthread->os_id)
139 return -1;
140 else if (ahwthread->os_id > bhwthread->os_id)
141 return 1;
142 return 0;
143}
144
145#if KMP_AFFINITY_SUPPORTED
146int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
147 int i;
148 const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
149 const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
150 int depth = __kmp_topology->get_depth();
151 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
152 KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
153 for (i = 0; i < __kmp_affinity_compact; i++) {
154 int j = depth - i - 1;
155 if (aa->sub_ids[j] < bb->sub_ids[j])
156 return -1;
157 if (aa->sub_ids[j] > bb->sub_ids[j])
158 return 1;
159 }
160 for (; i < depth; i++) {
161 int j = i - __kmp_affinity_compact;
162 if (aa->sub_ids[j] < bb->sub_ids[j])
163 return -1;
164 if (aa->sub_ids[j] > bb->sub_ids[j])
165 return 1;
166 }
167 return 0;
168}
169#endif
170
171void kmp_hw_thread_t::print() const {
172 int depth = __kmp_topology->get_depth();
173 printf("%4d ", os_id);
174 for (int i = 0; i < depth; ++i) {
175 printf("%4d ", ids[i]);
176 }
177 printf("\n");
178}
179
181// kmp_topology_t methods
182
183// Remove layers that don't add information to the topology.
184// This is done by having the layer take on the id = UNKNOWN_ID (-1)
185void kmp_topology_t::_remove_radix1_layers() {
186 int preference[KMP_HW_LAST];
187 int top_index1, top_index2;
188 // Set up preference associative array
189 preference[KMP_HW_PROC_GROUP] = 110;
190 preference[KMP_HW_SOCKET] = 100;
191 preference[KMP_HW_CORE] = 95;
192 preference[KMP_HW_THREAD] = 90;
193 preference[KMP_HW_NUMA] = 85;
194 preference[KMP_HW_DIE] = 80;
195 preference[KMP_HW_TILE] = 75;
196 preference[KMP_HW_MODULE] = 73;
197 preference[KMP_HW_L3] = 70;
198 preference[KMP_HW_L2] = 65;
199 preference[KMP_HW_L1] = 60;
200 preference[KMP_HW_LLC] = 5;
201 top_index1 = 0;
202 top_index2 = 1;
203 while (top_index1 < depth - 1 && top_index2 < depth) {
204 kmp_hw_t type1 = types[top_index1];
205 kmp_hw_t type2 = types[top_index2];
206 KMP_ASSERT_VALID_HW_TYPE(type1);
207 KMP_ASSERT_VALID_HW_TYPE(type2);
208 // Do not allow the three main topology levels (sockets, cores, threads) to
209 // be compacted down
210 if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
211 type1 == KMP_HW_SOCKET) &&
212 (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
213 type2 == KMP_HW_SOCKET)) {
214 top_index1 = top_index2++;
215 continue;
216 }
217 bool radix1 = true;
218 bool all_same = true;
219 int id1 = hw_threads[0].ids[top_index1];
220 int id2 = hw_threads[0].ids[top_index2];
221 int pref1 = preference[type1];
222 int pref2 = preference[type2];
223 for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
224 if (hw_threads[hwidx].ids[top_index1] == id1 &&
225 hw_threads[hwidx].ids[top_index2] != id2) {
226 radix1 = false;
227 break;
228 }
229 if (hw_threads[hwidx].ids[top_index2] != id2)
230 all_same = false;
231 id1 = hw_threads[hwidx].ids[top_index1];
232 id2 = hw_threads[hwidx].ids[top_index2];
233 }
234 if (radix1) {
235 // Select the layer to remove based on preference
236 kmp_hw_t remove_type, keep_type;
237 int remove_layer, remove_layer_ids;
238 if (pref1 > pref2) {
239 remove_type = type2;
240 remove_layer = remove_layer_ids = top_index2;
241 keep_type = type1;
242 } else {
243 remove_type = type1;
244 remove_layer = remove_layer_ids = top_index1;
245 keep_type = type2;
246 }
247 // If all the indexes for the second (deeper) layer are the same.
248 // e.g., all are zero, then make sure to keep the first layer's ids
249 if (all_same)
250 remove_layer_ids = top_index2;
251 // Remove radix one type by setting the equivalence, removing the id from
252 // the hw threads and removing the layer from types and depth
253 set_equivalent_type(remove_type, keep_type);
254 for (int idx = 0; idx < num_hw_threads; ++idx) {
255 kmp_hw_thread_t &hw_thread = hw_threads[idx];
256 for (int d = remove_layer_ids; d < depth - 1; ++d)
257 hw_thread.ids[d] = hw_thread.ids[d + 1];
258 }
259 for (int idx = remove_layer; idx < depth - 1; ++idx)
260 types[idx] = types[idx + 1];
261 depth--;
262 } else {
263 top_index1 = top_index2++;
264 }
265 }
266 KMP_ASSERT(depth > 0);
267}
268
269void kmp_topology_t::_set_last_level_cache() {
270 if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
271 set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
272 else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
273 set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
274#if KMP_MIC_SUPPORTED
275 else if (__kmp_mic_type == mic3) {
276 if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
277 set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
278 else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
279 set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
280 // L2/Tile wasn't detected so just say L1
281 else
282 set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
283 }
284#endif
285 else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
286 set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
287 // Fallback is to set last level cache to socket or core
288 if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
289 if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
290 set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
291 else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
292 set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
293 }
294 KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
295}
296
297// Gather the count of each topology layer and the ratio
298void kmp_topology_t::_gather_enumeration_information() {
299 int previous_id[KMP_HW_LAST];
300 int max[KMP_HW_LAST];
301
302 for (int i = 0; i < depth; ++i) {
303 previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
304 max[i] = 0;
305 count[i] = 0;
306 ratio[i] = 0;
307 }
308 for (int i = 0; i < num_hw_threads; ++i) {
309 kmp_hw_thread_t &hw_thread = hw_threads[i];
310 for (int layer = 0; layer < depth; ++layer) {
311 int id = hw_thread.ids[layer];
312 if (id != previous_id[layer]) {
313 // Add an additional increment to each count
314 for (int l = layer; l < depth; ++l)
315 count[l]++;
316 // Keep track of topology layer ratio statistics
317 max[layer]++;
318 for (int l = layer + 1; l < depth; ++l) {
319 if (max[l] > ratio[l])
320 ratio[l] = max[l];
321 max[l] = 1;
322 }
323 break;
324 }
325 }
326 for (int layer = 0; layer < depth; ++layer) {
327 previous_id[layer] = hw_thread.ids[layer];
328 }
329 }
330 for (int layer = 0; layer < depth; ++layer) {
331 if (max[layer] > ratio[layer])
332 ratio[layer] = max[layer];
333 }
334}
335
336// Find out if the topology is uniform
337void kmp_topology_t::_discover_uniformity() {
338 int num = 1;
339 for (int level = 0; level < depth; ++level)
340 num *= ratio[level];
341 flags.uniform = (num == count[depth - 1]);
342}
343
344// Set all the sub_ids for each hardware thread
345void kmp_topology_t::_set_sub_ids() {
346 int previous_id[KMP_HW_LAST];
347 int sub_id[KMP_HW_LAST];
348
349 for (int i = 0; i < depth; ++i) {
350 previous_id[i] = -1;
351 sub_id[i] = -1;
352 }
353 for (int i = 0; i < num_hw_threads; ++i) {
354 kmp_hw_thread_t &hw_thread = hw_threads[i];
355 // Setup the sub_id
356 for (int j = 0; j < depth; ++j) {
357 if (hw_thread.ids[j] != previous_id[j]) {
358 sub_id[j]++;
359 for (int k = j + 1; k < depth; ++k) {
360 sub_id[k] = 0;
361 }
362 break;
363 }
364 }
365 // Set previous_id
366 for (int j = 0; j < depth; ++j) {
367 previous_id[j] = hw_thread.ids[j];
368 }
369 // Set the sub_ids field
370 for (int j = 0; j < depth; ++j) {
371 hw_thread.sub_ids[j] = sub_id[j];
372 }
373 }
374}
375
376void kmp_topology_t::_set_globals() {
377 // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
378 int core_level, thread_level, package_level;
379 package_level = get_level(KMP_HW_SOCKET);
380#if KMP_GROUP_AFFINITY
381 if (package_level == -1)
382 package_level = get_level(KMP_HW_PROC_GROUP);
383#endif
384 core_level = get_level(KMP_HW_CORE);
385 thread_level = get_level(KMP_HW_THREAD);
386
387 KMP_ASSERT(core_level != -1);
388 KMP_ASSERT(thread_level != -1);
389
390 __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
391 if (package_level != -1) {
392 nCoresPerPkg = calculate_ratio(core_level, package_level);
393 nPackages = get_count(package_level);
394 } else {
395 // assume one socket
396 nCoresPerPkg = get_count(core_level);
397 nPackages = 1;
398 }
399#ifndef KMP_DFLT_NTH_CORES
400 __kmp_ncores = get_count(core_level);
401#endif
402}
403
404kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
405 const kmp_hw_t *types) {
406 kmp_topology_t *retval;
407 // Allocate all data in one large allocation
408 size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
409 sizeof(int) * ndepth * 3;
410 char *bytes = (char *)__kmp_allocate(size);
411 retval = (kmp_topology_t *)bytes;
412 if (nproc > 0) {
413 retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
414 } else {
415 retval->hw_threads = nullptr;
416 }
417 retval->num_hw_threads = nproc;
418 retval->depth = ndepth;
419 int *arr =
420 (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
421 retval->types = (kmp_hw_t *)arr;
422 retval->ratio = arr + ndepth;
423 retval->count = arr + 2 * ndepth;
424 KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
425 for (int i = 0; i < ndepth; ++i) {
426 retval->types[i] = types[i];
427 retval->equivalent[types[i]] = types[i];
428 }
429 return retval;
430}
431
432void kmp_topology_t::deallocate(kmp_topology_t *topology) {
433 if (topology)
434 __kmp_free(topology);
435}
436
437bool kmp_topology_t::check_ids() const {
438 // Assume ids have been sorted
439 if (num_hw_threads == 0)
440 return true;
441 for (int i = 1; i < num_hw_threads; ++i) {
442 kmp_hw_thread_t &current_thread = hw_threads[i];
443 kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
444 bool unique = false;
445 for (int j = 0; j < depth; ++j) {
446 if (previous_thread.ids[j] != current_thread.ids[j]) {
447 unique = true;
448 break;
449 }
450 }
451 if (unique)
452 continue;
453 return false;
454 }
455 return true;
456}
457
458void kmp_topology_t::dump() const {
459 printf("***********************\n");
460 printf("*** __kmp_topology: ***\n");
461 printf("***********************\n");
462 printf("* depth: %d\n", depth);
463
464 printf("* types: ");
465 for (int i = 0; i < depth; ++i)
466 printf("%15s ", __kmp_hw_get_keyword(types[i]));
467 printf("\n");
468
469 printf("* ratio: ");
470 for (int i = 0; i < depth; ++i) {
471 printf("%15d ", ratio[i]);
472 }
473 printf("\n");
474
475 printf("* count: ");
476 for (int i = 0; i < depth; ++i) {
477 printf("%15d ", count[i]);
478 }
479 printf("\n");
480
481 printf("* equivalent map:\n");
482 KMP_FOREACH_HW_TYPE(i) {
483 const char *key = __kmp_hw_get_keyword(i);
484 const char *value = __kmp_hw_get_keyword(equivalent[i]);
485 printf("%-15s -> %-15s\n", key, value);
486 }
487
488 printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
489
490 printf("* num_hw_threads: %d\n", num_hw_threads);
491 printf("* hw_threads:\n");
492 for (int i = 0; i < num_hw_threads; ++i) {
493 hw_threads[i].print();
494 }
495 printf("***********************\n");
496}
497
498void kmp_topology_t::print(const char *env_var) const {
499 kmp_str_buf_t buf;
500 int print_types_depth;
501 __kmp_str_buf_init(&buf);
502 kmp_hw_t print_types[KMP_HW_LAST + 2];
503
504 // Num Available Threads
505 KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
506
507 // Uniform or not
508 if (is_uniform()) {
509 KMP_INFORM(Uniform, env_var);
510 } else {
511 KMP_INFORM(NonUniform, env_var);
512 }
513
514 // Equivalent types
515 KMP_FOREACH_HW_TYPE(type) {
516 kmp_hw_t eq_type = equivalent[type];
517 if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
518 KMP_INFORM(AffEqualTopologyTypes, env_var,
519 __kmp_hw_get_catalog_string(type),
520 __kmp_hw_get_catalog_string(eq_type));
521 }
522 }
523
524 // Quick topology
525 KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
526 // Create a print types array that always guarantees printing
527 // the core and thread level
528 print_types_depth = 0;
529 for (int level = 0; level < depth; ++level)
530 print_types[print_types_depth++] = types[level];
531 if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
532 // Force in the core level for quick topology
533 if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
534 // Force core before thread e.g., 1 socket X 2 threads/socket
535 // becomes 1 socket X 1 core/socket X 2 threads/socket
536 print_types[print_types_depth - 1] = KMP_HW_CORE;
537 print_types[print_types_depth++] = KMP_HW_THREAD;
538 } else {
539 print_types[print_types_depth++] = KMP_HW_CORE;
540 }
541 }
542 // Always put threads at very end of quick topology
543 if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
544 print_types[print_types_depth++] = KMP_HW_THREAD;
545
546 __kmp_str_buf_clear(&buf);
547 kmp_hw_t numerator_type;
548 kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
549 int core_level = get_level(KMP_HW_CORE);
550 int ncores = get_count(core_level);
551
552 for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
553 int c;
554 bool plural;
555 numerator_type = print_types[plevel];
556 KMP_ASSERT_VALID_HW_TYPE(numerator_type);
557 if (equivalent[numerator_type] != numerator_type)
558 c = 1;
559 else
560 c = get_ratio(level++);
561 plural = (c > 1);
562 if (plevel == 0) {
563 __kmp_str_buf_print(&buf, "%d %s", c,
564 __kmp_hw_get_catalog_string(numerator_type, plural));
565 } else {
566 __kmp_str_buf_print(&buf, " x %d %s/%s", c,
567 __kmp_hw_get_catalog_string(numerator_type, plural),
568 __kmp_hw_get_catalog_string(denominator_type));
569 }
570 denominator_type = numerator_type;
571 }
572 KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
573
574 if (num_hw_threads <= 0) {
575 __kmp_str_buf_free(&buf);
576 return;
577 }
578
579 // Full OS proc to hardware thread map
580 KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
581 for (int i = 0; i < num_hw_threads; i++) {
582 __kmp_str_buf_clear(&buf);
583 for (int level = 0; level < depth; ++level) {
584 kmp_hw_t type = types[level];
585 __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
586 __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
587 }
588 KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
589 }
590
591 __kmp_str_buf_free(&buf);
592}
593
594void kmp_topology_t::canonicalize() {
595 _remove_radix1_layers();
596 _gather_enumeration_information();
597 _discover_uniformity();
598 _set_sub_ids();
599 _set_globals();
600 _set_last_level_cache();
601
602#if KMP_MIC_SUPPORTED
603 // Manually Add L2 = Tile equivalence
604 if (__kmp_mic_type == mic3) {
605 if (get_level(KMP_HW_L2) != -1)
606 set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
607 else if (get_level(KMP_HW_TILE) != -1)
608 set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
609 }
610#endif
611
612 // Perform post canonicalization checking
613 KMP_ASSERT(depth > 0);
614 for (int level = 0; level < depth; ++level) {
615 // All counts, ratios, and types must be valid
616 KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
617 KMP_ASSERT_VALID_HW_TYPE(types[level]);
618 // Detected types must point to themselves
619 KMP_ASSERT(equivalent[types[level]] == types[level]);
620 }
621
622#if KMP_AFFINITY_SUPPORTED
623 // Set the number of affinity granularity levels
624 if (__kmp_affinity_gran_levels < 0) {
625 kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
626 // Check if user's granularity request is valid
627 if (gran_type == KMP_HW_UNKNOWN) {
628 // First try core, then thread, then package
629 kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
630 for (auto g : gran_types) {
631 if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
632 gran_type = g;
633 break;
634 }
635 }
636 KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
637 // Warn user what granularity setting will be used instead
638 KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
639 __kmp_hw_get_catalog_string(__kmp_affinity_gran),
640 __kmp_hw_get_catalog_string(gran_type));
641 __kmp_affinity_gran = gran_type;
642 }
643 __kmp_affinity_gran_levels = 0;
644 for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
645 __kmp_affinity_gran_levels++;
646 }
647#endif // KMP_AFFINITY_SUPPORTED
648}
649
650// Canonicalize an explicit packages X cores/pkg X threads/core topology
651void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
652 int nthreads_per_core, int ncores) {
653 int ndepth = 3;
654 depth = ndepth;
655 KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
656 for (int level = 0; level < depth; ++level) {
657 count[level] = 0;
658 ratio[level] = 0;
659 }
660 count[0] = npackages;
661 count[1] = ncores;
662 count[2] = __kmp_xproc;
663 ratio[0] = npackages;
664 ratio[1] = ncores_per_pkg;
665 ratio[2] = nthreads_per_core;
666 equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
667 equivalent[KMP_HW_CORE] = KMP_HW_CORE;
668 equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
669 types[0] = KMP_HW_SOCKET;
670 types[1] = KMP_HW_CORE;
671 types[2] = KMP_HW_THREAD;
672 //__kmp_avail_proc = __kmp_xproc;
673 _discover_uniformity();
674}
675
676// Apply the KMP_HW_SUBSET envirable to the topology
677// Returns true if KMP_HW_SUBSET filtered any processors
678// otherwise, returns false
679bool kmp_topology_t::filter_hw_subset() {
680 // If KMP_HW_SUBSET wasn't requested, then do nothing.
681 if (!__kmp_hw_subset)
682 return false;
683
684 // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
685 int hw_subset_depth = __kmp_hw_subset->get_depth();
686 kmp_hw_t specified[KMP_HW_LAST];
687 KMP_ASSERT(hw_subset_depth > 0);
688 KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
689 for (int i = 0; i < hw_subset_depth; ++i) {
690 int max_count;
691 int num = __kmp_hw_subset->at(i).num;
692 int offset = __kmp_hw_subset->at(i).offset;
693 kmp_hw_t type = __kmp_hw_subset->at(i).type;
694 kmp_hw_t equivalent_type = equivalent[type];
695 int level = get_level(type);
696
697 // Check to see if current layer is in detected machine topology
698 if (equivalent_type != KMP_HW_UNKNOWN) {
699 __kmp_hw_subset->at(i).type = equivalent_type;
700 } else {
701 KMP_WARNING(AffHWSubsetNotExistGeneric,
702 __kmp_hw_get_catalog_string(type));
703 return false;
704 }
705
706 // Check to see if current layer has already been specified
707 // either directly or through an equivalent type
708 if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
709 KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
710 __kmp_hw_get_catalog_string(specified[equivalent_type]));
711 return false;
712 }
713 specified[equivalent_type] = type;
714
715 // Check to see if layers are in order
716 if (i + 1 < hw_subset_depth) {
717 kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
718 if (next_type == KMP_HW_UNKNOWN) {
719 KMP_WARNING(
720 AffHWSubsetNotExistGeneric,
721 __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
722 return false;
723 }
724 int next_topology_level = get_level(next_type);
725 if (level > next_topology_level) {
726 KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
727 __kmp_hw_get_catalog_string(next_type));
728 return false;
729 }
730 }
731
732 // Check to see if each layer's num & offset parameters are valid
733 max_count = get_ratio(level);
734 if (max_count < 0 || num + offset > max_count) {
735 bool plural = (num > 1);
736 KMP_WARNING(AffHWSubsetManyGeneric,
737 __kmp_hw_get_catalog_string(type, plural));
738 return false;
739 }
740 }
741
742 // Apply the filtered hardware subset
743 int new_index = 0;
744 for (int i = 0; i < num_hw_threads; ++i) {
745 kmp_hw_thread_t &hw_thread = hw_threads[i];
746 // Check to see if this hardware thread should be filtered
747 bool should_be_filtered = false;
748 for (int level = 0, hw_subset_index = 0;
749 level < depth && hw_subset_index < hw_subset_depth; ++level) {
750 kmp_hw_t topology_type = types[level];
751 auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
752 kmp_hw_t hw_subset_type = hw_subset_item.type;
753 if (topology_type != hw_subset_type)
754 continue;
755 int num = hw_subset_item.num;
756 int offset = hw_subset_item.offset;
757 hw_subset_index++;
758 if (hw_thread.sub_ids[level] < offset ||
759 hw_thread.sub_ids[level] >= offset + num) {
760 should_be_filtered = true;
761 break;
762 }
763 }
764 if (!should_be_filtered) {
765 if (i != new_index)
766 hw_threads[new_index] = hw_thread;
767 new_index++;
768 } else {
769#if KMP_AFFINITY_SUPPORTED
770 KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
771#endif
772 __kmp_avail_proc--;
773 }
774 }
775 KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
776 num_hw_threads = new_index;
777
778 // Post hardware subset canonicalization
779 _gather_enumeration_information();
780 _discover_uniformity();
781 _set_globals();
782 _set_last_level_cache();
783 return true;
784}
785
786bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
787 if (hw_level >= depth)
788 return true;
789 bool retval = true;
790 const kmp_hw_thread_t &t1 = hw_threads[hwt1];
791 const kmp_hw_thread_t &t2 = hw_threads[hwt2];
792 for (int i = 0; i < (depth - hw_level); ++i) {
793 if (t1.ids[i] != t2.ids[i])
794 return false;
795 }
796 return retval;
797}
798
800
801#if KMP_AFFINITY_SUPPORTED
802class kmp_affinity_raii_t {
803 kmp_affin_mask_t *mask;
804 bool restored;
805
806public:
807 kmp_affinity_raii_t() : restored(false) {
808 KMP_CPU_ALLOC(mask);
809 KMP_ASSERT(mask != NULL);
810 __kmp_get_system_affinity(mask, TRUE);
811 }
812 void restore() {
813 __kmp_set_system_affinity(mask, TRUE);
814 KMP_CPU_FREE(mask);
815 restored = true;
816 }
817 ~kmp_affinity_raii_t() {
818 if (!restored) {
819 __kmp_set_system_affinity(mask, TRUE);
820 KMP_CPU_FREE(mask);
821 }
822 }
823};
824
825bool KMPAffinity::picked_api = false;
826
827void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
828void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
829void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
830void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
831void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
832void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
833
834void KMPAffinity::pick_api() {
835 KMPAffinity *affinity_dispatch;
836 if (picked_api)
837 return;
838#if KMP_USE_HWLOC
839 // Only use Hwloc if affinity isn't explicitly disabled and
840 // user requests Hwloc topology method
841 if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
842 __kmp_affinity_type != affinity_disabled) {
843 affinity_dispatch = new KMPHwlocAffinity();
844 } else
845#endif
846 {
847 affinity_dispatch = new KMPNativeAffinity();
848 }
849 __kmp_affinity_dispatch = affinity_dispatch;
850 picked_api = true;
851}
852
853void KMPAffinity::destroy_api() {
854 if (__kmp_affinity_dispatch != NULL) {
855 delete __kmp_affinity_dispatch;
856 __kmp_affinity_dispatch = NULL;
857 picked_api = false;
858 }
859}
860
861#define KMP_ADVANCE_SCAN(scan) \
862 while (*scan != '\0') { \
863 scan++; \
864 }
865
866// Print the affinity mask to the character array in a pretty format.
867// The format is a comma separated list of non-negative integers or integer
868// ranges: e.g., 1,2,3-5,7,9-15
869// The format can also be the string "{<empty>}" if no bits are set in mask
870char *__kmp_affinity_print_mask(char *buf, int buf_len,
871 kmp_affin_mask_t *mask) {
872 int start = 0, finish = 0, previous = 0;
873 bool first_range;
874 KMP_ASSERT(buf);
875 KMP_ASSERT(buf_len >= 40);
876 KMP_ASSERT(mask);
877 char *scan = buf;
878 char *end = buf + buf_len - 1;
879
880 // Check for empty set.
881 if (mask->begin() == mask->end()) {
882 KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
883 KMP_ADVANCE_SCAN(scan);
884 KMP_ASSERT(scan <= end);
885 return buf;
886 }
887
888 first_range = true;
889 start = mask->begin();
890 while (1) {
891 // Find next range
892 // [start, previous] is inclusive range of contiguous bits in mask
893 for (finish = mask->next(start), previous = start;
894 finish == previous + 1 && finish != mask->end();
895 finish = mask->next(finish)) {
896 previous = finish;
897 }
898
899 // The first range does not need a comma printed before it, but the rest
900 // of the ranges do need a comma beforehand
901 if (!first_range) {
902 KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
903 KMP_ADVANCE_SCAN(scan);
904 } else {
905 first_range = false;
906 }
907 // Range with three or more contiguous bits in the affinity mask
908 if (previous - start > 1) {
909 KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
910 } else {
911 // Range with one or two contiguous bits in the affinity mask
912 KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
913 KMP_ADVANCE_SCAN(scan);
914 if (previous - start > 0) {
915 KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
916 }
917 }
918 KMP_ADVANCE_SCAN(scan);
919 // Start over with new start point
920 start = finish;
921 if (start == mask->end())
922 break;
923 // Check for overflow
924 if (end - scan < 2)
925 break;
926 }
927
928 // Check for overflow
929 KMP_ASSERT(scan <= end);
930 return buf;
931}
932#undef KMP_ADVANCE_SCAN
933
934// Print the affinity mask to the string buffer object in a pretty format
935// The format is a comma separated list of non-negative integers or integer
936// ranges: e.g., 1,2,3-5,7,9-15
937// The format can also be the string "{<empty>}" if no bits are set in mask
938kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
939 kmp_affin_mask_t *mask) {
940 int start = 0, finish = 0, previous = 0;
941 bool first_range;
942 KMP_ASSERT(buf);
943 KMP_ASSERT(mask);
944
945 __kmp_str_buf_clear(buf);
946
947 // Check for empty set.
948 if (mask->begin() == mask->end()) {
949 __kmp_str_buf_print(buf, "%s", "{<empty>}");
950 return buf;
951 }
952
953 first_range = true;
954 start = mask->begin();
955 while (1) {
956 // Find next range
957 // [start, previous] is inclusive range of contiguous bits in mask
958 for (finish = mask->next(start), previous = start;
959 finish == previous + 1 && finish != mask->end();
960 finish = mask->next(finish)) {
961 previous = finish;
962 }
963
964 // The first range does not need a comma printed before it, but the rest
965 // of the ranges do need a comma beforehand
966 if (!first_range) {
967 __kmp_str_buf_print(buf, "%s", ",");
968 } else {
969 first_range = false;
970 }
971 // Range with three or more contiguous bits in the affinity mask
972 if (previous - start > 1) {
973 __kmp_str_buf_print(buf, "%u-%u", start, previous);
974 } else {
975 // Range with one or two contiguous bits in the affinity mask
976 __kmp_str_buf_print(buf, "%u", start);
977 if (previous - start > 0) {
978 __kmp_str_buf_print(buf, ",%u", previous);
979 }
980 }
981 // Start over with new start point
982 start = finish;
983 if (start == mask->end())
984 break;
985 }
986 return buf;
987}
988
989void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
990 KMP_CPU_ZERO(mask);
991
992#if KMP_GROUP_AFFINITY
993
994 if (__kmp_num_proc_groups > 1) {
995 int group;
996 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
997 for (group = 0; group < __kmp_num_proc_groups; group++) {
998 int i;
999 int num = __kmp_GetActiveProcessorCount(group);
1000 for (i = 0; i < num; i++) {
1001 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1002 }
1003 }
1004 } else
1005
1006#endif /* KMP_GROUP_AFFINITY */
1007
1008 {
1009 int proc;
1010 for (proc = 0; proc < __kmp_xproc; proc++) {
1011 KMP_CPU_SET(proc, mask);
1012 }
1013 }
1014}
1015
1016// All of the __kmp_affinity_create_*_map() routines should allocate the
1017// internal topology object and set the layer ids for it. Each routine
1018// returns a boolean on whether it was successful at doing so.
1019kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1020
1021#if KMP_USE_HWLOC
1022static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1023#if HWLOC_API_VERSION >= 0x00020000
1024 return hwloc_obj_type_is_cache(obj->type);
1025#else
1026 return obj->type == HWLOC_OBJ_CACHE;
1027#endif
1028}
1029
1030// Returns KMP_HW_* type derived from HWLOC_* type
1031static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1032
1033 if (__kmp_hwloc_is_cache_type(obj)) {
1034 if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1035 return KMP_HW_UNKNOWN;
1036 switch (obj->attr->cache.depth) {
1037 case 1:
1038 return KMP_HW_L1;
1039 case 2:
1040#if KMP_MIC_SUPPORTED
1041 if (__kmp_mic_type == mic3) {
1042 return KMP_HW_TILE;
1043 }
1044#endif
1045 return KMP_HW_L2;
1046 case 3:
1047 return KMP_HW_L3;
1048 }
1049 return KMP_HW_UNKNOWN;
1050 }
1051
1052 switch (obj->type) {
1053 case HWLOC_OBJ_PACKAGE:
1054 return KMP_HW_SOCKET;
1055 case HWLOC_OBJ_NUMANODE:
1056 return KMP_HW_NUMA;
1057 case HWLOC_OBJ_CORE:
1058 return KMP_HW_CORE;
1059 case HWLOC_OBJ_PU:
1060 return KMP_HW_THREAD;
1061 case HWLOC_OBJ_GROUP:
1062 if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1063 return KMP_HW_DIE;
1064 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1065 return KMP_HW_TILE;
1066 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1067 return KMP_HW_MODULE;
1068 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1069 return KMP_HW_PROC_GROUP;
1070 return KMP_HW_UNKNOWN;
1071#if HWLOC_API_VERSION >= 0x00020100
1072 case HWLOC_OBJ_DIE:
1073 return KMP_HW_DIE;
1074#endif
1075 }
1076 return KMP_HW_UNKNOWN;
1077}
1078
1079// Returns the number of objects of type 'type' below 'obj' within the topology
1080// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1081// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1082// object.
1083static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1084 hwloc_obj_type_t type) {
1085 int retval = 0;
1086 hwloc_obj_t first;
1087 for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1088 obj->logical_index, type, 0);
1089 first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1090 obj->type, first) == obj;
1091 first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1092 first)) {
1093 ++retval;
1094 }
1095 return retval;
1096}
1097
1098// This gets the sub_id for a lower object under a higher object in the
1099// topology tree
1100static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1101 hwloc_obj_t lower) {
1102 hwloc_obj_t obj;
1103 hwloc_obj_type_t ltype = lower->type;
1104 int lindex = lower->logical_index - 1;
1105 int sub_id = 0;
1106 // Get the previous lower object
1107 obj = hwloc_get_obj_by_type(t, ltype, lindex);
1108 while (obj && lindex >= 0 &&
1109 hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1110 if (obj->userdata) {
1111 sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1112 break;
1113 }
1114 sub_id++;
1115 lindex--;
1116 obj = hwloc_get_obj_by_type(t, ltype, lindex);
1117 }
1118 // store sub_id + 1 so that 0 is differed from NULL
1119 lower->userdata = RCAST(void *, sub_id + 1);
1120 return sub_id;
1121}
1122
1123static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1124 kmp_hw_t type;
1125 int hw_thread_index, sub_id;
1126 int depth;
1127 hwloc_obj_t pu, obj, root, prev;
1128 kmp_hw_t types[KMP_HW_LAST];
1129 hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1130
1131 hwloc_topology_t tp = __kmp_hwloc_topology;
1132 *msg_id = kmp_i18n_null;
1133 if (__kmp_affinity_verbose) {
1134 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1135 }
1136
1137 if (!KMP_AFFINITY_CAPABLE()) {
1138 // Hack to try and infer the machine topology using only the data
1139 // available from hwloc on the current thread, and __kmp_xproc.
1140 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1141 // hwloc only guarantees existance of PU object, so check PACKAGE and CORE
1142 hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
1143 if (o != NULL)
1144 nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1145 else
1146 nCoresPerPkg = 1; // no PACKAGE found
1147 o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1148 if (o != NULL)
1149 __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1150 else
1151 __kmp_nThreadsPerCore = 1; // no CORE found
1152 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1153 if (nCoresPerPkg == 0)
1154 nCoresPerPkg = 1; // to prevent possible division by 0
1155 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1156 return true;
1157 }
1158
1159 root = hwloc_get_root_obj(tp);
1160
1161 // Figure out the depth and types in the topology
1162 depth = 0;
1163 pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1164 KMP_ASSERT(pu);
1165 obj = pu;
1166 types[depth] = KMP_HW_THREAD;
1167 hwloc_types[depth] = obj->type;
1168 depth++;
1169 while (obj != root && obj != NULL) {
1170 obj = obj->parent;
1171#if HWLOC_API_VERSION >= 0x00020000
1172 if (obj->memory_arity) {
1173 hwloc_obj_t memory;
1174 for (memory = obj->memory_first_child; memory;
1175 memory = hwloc_get_next_child(tp, obj, memory)) {
1176 if (memory->type == HWLOC_OBJ_NUMANODE)
1177 break;
1178 }
1179 if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1180 types[depth] = KMP_HW_NUMA;
1181 hwloc_types[depth] = memory->type;
1182 depth++;
1183 }
1184 }
1185#endif
1186 type = __kmp_hwloc_type_2_topology_type(obj);
1187 if (type != KMP_HW_UNKNOWN) {
1188 types[depth] = type;
1189 hwloc_types[depth] = obj->type;
1190 depth++;
1191 }
1192 }
1193 KMP_ASSERT(depth > 0);
1194
1195 // Get the order for the types correct
1196 for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1197 hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1198 kmp_hw_t temp = types[i];
1199 types[i] = types[j];
1200 types[j] = temp;
1201 hwloc_types[i] = hwloc_types[j];
1202 hwloc_types[j] = hwloc_temp;
1203 }
1204
1205 // Allocate the data structure to be returned.
1206 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1207
1208 hw_thread_index = 0;
1209 pu = NULL;
1210 while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) {
1211 int index = depth - 1;
1212 bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1213 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1214 if (included) {
1215 hw_thread.clear();
1216 hw_thread.ids[index] = pu->logical_index;
1217 hw_thread.os_id = pu->os_index;
1218 index--;
1219 }
1220 obj = pu;
1221 prev = obj;
1222 while (obj != root && obj != NULL) {
1223 obj = obj->parent;
1224#if HWLOC_API_VERSION >= 0x00020000
1225 // NUMA Nodes are handled differently since they are not within the
1226 // parent/child structure anymore. They are separate children
1227 // of obj (memory_first_child points to first memory child)
1228 if (obj->memory_arity) {
1229 hwloc_obj_t memory;
1230 for (memory = obj->memory_first_child; memory;
1231 memory = hwloc_get_next_child(tp, obj, memory)) {
1232 if (memory->type == HWLOC_OBJ_NUMANODE)
1233 break;
1234 }
1235 if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1236 sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1237 if (included) {
1238 hw_thread.ids[index] = memory->logical_index;
1239 hw_thread.ids[index + 1] = sub_id;
1240 index--;
1241 }
1242 prev = memory;
1243 }
1244 prev = obj;
1245 }
1246#endif
1247 type = __kmp_hwloc_type_2_topology_type(obj);
1248 if (type != KMP_HW_UNKNOWN) {
1249 sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1250 if (included) {
1251 hw_thread.ids[index] = obj->logical_index;
1252 hw_thread.ids[index + 1] = sub_id;
1253 index--;
1254 }
1255 prev = obj;
1256 }
1257 }
1258 if (included)
1259 hw_thread_index++;
1260 }
1261 __kmp_topology->sort_ids();
1262 return true;
1263}
1264#endif // KMP_USE_HWLOC
1265
1266// If we don't know how to retrieve the machine's processor topology, or
1267// encounter an error in doing so, this routine is called to form a "flat"
1268// mapping of os thread id's <-> processor id's.
1269static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1270 *msg_id = kmp_i18n_null;
1271 int depth = 3;
1272 kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1273
1274 if (__kmp_affinity_verbose) {
1275 KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1276 }
1277
1278 // Even if __kmp_affinity_type == affinity_none, this routine might still
1279 // called to set __kmp_ncores, as well as
1280 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1281 if (!KMP_AFFINITY_CAPABLE()) {
1282 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1283 __kmp_ncores = nPackages = __kmp_xproc;
1284 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1285 return true;
1286 }
1287
1288 // When affinity is off, this routine will still be called to set
1289 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1290 // Make sure all these vars are set correctly, and return now if affinity is
1291 // not enabled.
1292 __kmp_ncores = nPackages = __kmp_avail_proc;
1293 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1294
1295 // Construct the data structure to be returned.
1296 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1297 int avail_ct = 0;
1298 int i;
1299 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1300 // Skip this proc if it is not included in the machine model.
1301 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1302 continue;
1303 }
1304 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
1305 hw_thread.clear();
1306 hw_thread.os_id = i;
1307 hw_thread.ids[0] = i;
1308 hw_thread.ids[1] = 0;
1309 hw_thread.ids[2] = 0;
1310 avail_ct++;
1311 }
1312 if (__kmp_affinity_verbose) {
1313 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
1314 }
1315 return true;
1316}
1317
1318#if KMP_GROUP_AFFINITY
1319// If multiple Windows* OS processor groups exist, we can create a 2-level
1320// topology map with the groups at level 0 and the individual procs at level 1.
1321// This facilitates letting the threads float among all procs in a group,
1322// if granularity=group (the default when there are multiple groups).
1323static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
1324 *msg_id = kmp_i18n_null;
1325 int depth = 3;
1326 kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
1327 const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
1328
1329 if (__kmp_affinity_verbose) {
1330 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
1331 }
1332
1333 // If we aren't affinity capable, then use flat topology
1334 if (!KMP_AFFINITY_CAPABLE()) {
1335 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1336 nPackages = __kmp_num_proc_groups;
1337 __kmp_nThreadsPerCore = 1;
1338 __kmp_ncores = __kmp_xproc;
1339 nCoresPerPkg = nPackages / __kmp_ncores;
1340 return true;
1341 }
1342
1343 // Construct the data structure to be returned.
1344 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1345 int avail_ct = 0;
1346 int i;
1347 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1348 // Skip this proc if it is not included in the machine model.
1349 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1350 continue;
1351 }
1352 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
1353 hw_thread.clear();
1354 hw_thread.os_id = i;
1355 hw_thread.ids[0] = i / BITS_PER_GROUP;
1356 hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
1357 }
1358 return true;
1359}
1360#endif /* KMP_GROUP_AFFINITY */
1361
1362#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1363
1364template <kmp_uint32 LSB, kmp_uint32 MSB>
1365static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
1366 const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
1367 const kmp_uint32 SHIFT_RIGHT = LSB;
1368 kmp_uint32 retval = v;
1369 retval <<= SHIFT_LEFT;
1370 retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
1371 return retval;
1372}
1373
1374static int __kmp_cpuid_mask_width(int count) {
1375 int r = 0;
1376
1377 while ((1 << r) < count)
1378 ++r;
1379 return r;
1380}
1381
1382class apicThreadInfo {
1383public:
1384 unsigned osId; // param to __kmp_affinity_bind_thread
1385 unsigned apicId; // from cpuid after binding
1386 unsigned maxCoresPerPkg; // ""
1387 unsigned maxThreadsPerPkg; // ""
1388 unsigned pkgId; // inferred from above values
1389 unsigned coreId; // ""
1390 unsigned threadId; // ""
1391};
1392
1393static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
1394 const void *b) {
1395 const apicThreadInfo *aa = (const apicThreadInfo *)a;
1396 const apicThreadInfo *bb = (const apicThreadInfo *)b;
1397 if (aa->pkgId < bb->pkgId)
1398 return -1;
1399 if (aa->pkgId > bb->pkgId)
1400 return 1;
1401 if (aa->coreId < bb->coreId)
1402 return -1;
1403 if (aa->coreId > bb->coreId)
1404 return 1;
1405 if (aa->threadId < bb->threadId)
1406 return -1;
1407 if (aa->threadId > bb->threadId)
1408 return 1;
1409 return 0;
1410}
1411
1412class kmp_cache_info_t {
1413public:
1414 struct info_t {
1415 unsigned level, mask;
1416 };
1417 kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
1418 size_t get_depth() const { return depth; }
1419 info_t &operator[](size_t index) { return table[index]; }
1420 const info_t &operator[](size_t index) const { return table[index]; }
1421
1422 static kmp_hw_t get_topology_type(unsigned level) {
1423 KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
1424 switch (level) {
1425 case 1:
1426 return KMP_HW_L1;
1427 case 2:
1428 return KMP_HW_L2;
1429 case 3:
1430 return KMP_HW_L3;
1431 }
1432 return KMP_HW_UNKNOWN;
1433 }
1434
1435private:
1436 static const int MAX_CACHE_LEVEL = 3;
1437
1438 size_t depth;
1439 info_t table[MAX_CACHE_LEVEL];
1440
1441 void get_leaf4_levels() {
1442 unsigned level = 0;
1443 while (depth < MAX_CACHE_LEVEL) {
1444 unsigned cache_type, max_threads_sharing;
1445 unsigned cache_level, cache_mask_width;
1446 kmp_cpuid buf2;
1447 __kmp_x86_cpuid(4, level, &buf2);
1448 cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
1449 if (!cache_type)
1450 break;
1451 // Skip instruction caches
1452 if (cache_type == 2) {
1453 level++;
1454 continue;
1455 }
1456 max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
1457 cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
1458 cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
1459 table[depth].level = cache_level;
1460 table[depth].mask = ((-1) << cache_mask_width);
1461 depth++;
1462 level++;
1463 }
1464 }
1465};
1466
1467// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
1468// an algorithm which cycles through the available os threads, setting
1469// the current thread's affinity mask to that thread, and then retrieves
1470// the Apic Id for each thread context using the cpuid instruction.
1471static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
1472 kmp_cpuid buf;
1473 *msg_id = kmp_i18n_null;
1474
1475 if (__kmp_affinity_verbose) {
1476 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
1477 }
1478
1479 // Check if cpuid leaf 4 is supported.
1480 __kmp_x86_cpuid(0, 0, &buf);
1481 if (buf.eax < 4) {
1482 *msg_id = kmp_i18n_str_NoLeaf4Support;
1483 return false;
1484 }
1485
1486 // The algorithm used starts by setting the affinity to each available thread
1487 // and retrieving info from the cpuid instruction, so if we are not capable of
1488 // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we
1489 // need to do something else - use the defaults that we calculated from
1490 // issuing cpuid without binding to each proc.
1491 if (!KMP_AFFINITY_CAPABLE()) {
1492 // Hack to try and infer the machine topology using only the data
1493 // available from cpuid on the current thread, and __kmp_xproc.
1494 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1495
1496 // Get an upper bound on the number of threads per package using cpuid(1).
1497 // On some OS/chps combinations where HT is supported by the chip but is
1498 // disabled, this value will be 2 on a single core chip. Usually, it will be
1499 // 2 if HT is enabled and 1 if HT is disabled.
1500 __kmp_x86_cpuid(1, 0, &buf);
1501 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1502 if (maxThreadsPerPkg == 0) {
1503 maxThreadsPerPkg = 1;
1504 }
1505
1506 // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
1507 // value.
1508 //
1509 // The author of cpu_count.cpp treated this only an upper bound on the
1510 // number of cores, but I haven't seen any cases where it was greater than
1511 // the actual number of cores, so we will treat it as exact in this block of
1512 // code.
1513 //
1514 // First, we need to check if cpuid(4) is supported on this chip. To see if
1515 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
1516 // greater.
1517 __kmp_x86_cpuid(0, 0, &buf);
1518 if (buf.eax >= 4) {
1519 __kmp_x86_cpuid(4, 0, &buf);
1520 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1521 } else {
1522 nCoresPerPkg = 1;
1523 }
1524
1525 // There is no way to reliably tell if HT is enabled without issuing the
1526 // cpuid instruction from every thread, can correlating the cpuid info, so
1527 // if the machine is not affinity capable, we assume that HT is off. We have
1528 // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
1529 // does not support HT.
1530 //
1531 // - Older OSes are usually found on machines with older chips, which do not
1532 // support HT.
1533 // - The performance penalty for mistakenly identifying a machine as HT when
1534 // it isn't (which results in blocktime being incorrectly set to 0) is
1535 // greater than the penalty when for mistakenly identifying a machine as
1536 // being 1 thread/core when it is really HT enabled (which results in
1537 // blocktime being incorrectly set to a positive value).
1538 __kmp_ncores = __kmp_xproc;
1539 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1540 __kmp_nThreadsPerCore = 1;
1541 return true;
1542 }
1543
1544 // From here on, we can assume that it is safe to call
1545 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1546 // __kmp_affinity_type = affinity_none.
1547
1548 // Save the affinity mask for the current thread.
1549 kmp_affinity_raii_t previous_affinity;
1550
1551 // Run through each of the available contexts, binding the current thread
1552 // to it, and obtaining the pertinent information using the cpuid instr.
1553 //
1554 // The relevant information is:
1555 // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
1556 // has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
1557 // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
1558 // of this field determines the width of the core# + thread# fields in the
1559 // Apic Id. It is also an upper bound on the number of threads per
1560 // package, but it has been verified that situations happen were it is not
1561 // exact. In particular, on certain OS/chip combinations where Intel(R)
1562 // Hyper-Threading Technology is supported by the chip but has been
1563 // disabled, the value of this field will be 2 (for a single core chip).
1564 // On other OS/chip combinations supporting Intel(R) Hyper-Threading
1565 // Technology, the value of this field will be 1 when Intel(R)
1566 // Hyper-Threading Technology is disabled and 2 when it is enabled.
1567 // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
1568 // of this field (+1) determines the width of the core# field in the Apic
1569 // Id. The comments in "cpucount.cpp" say that this value is an upper
1570 // bound, but the IA-32 architecture manual says that it is exactly the
1571 // number of cores per package, and I haven't seen any case where it
1572 // wasn't.
1573 //
1574 // From this information, deduce the package Id, core Id, and thread Id,
1575 // and set the corresponding fields in the apicThreadInfo struct.
1576 unsigned i;
1577 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1578 __kmp_avail_proc * sizeof(apicThreadInfo));
1579 unsigned nApics = 0;
1580 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1581 // Skip this proc if it is not included in the machine model.
1582 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1583 continue;
1584 }
1585 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1586
1587 __kmp_affinity_dispatch->bind_thread(i);
1588 threadInfo[nApics].osId = i;
1589
1590 // The apic id and max threads per pkg come from cpuid(1).
1591 __kmp_x86_cpuid(1, 0, &buf);
1592 if (((buf.edx >> 9) & 1) == 0) {
1593 __kmp_free(threadInfo);
1594 *msg_id = kmp_i18n_str_ApicNotPresent;
1595 return false;
1596 }
1597 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1598 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1599 if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1600 threadInfo[nApics].maxThreadsPerPkg = 1;
1601 }
1602
1603 // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
1604 // value.
1605 //
1606 // First, we need to check if cpuid(4) is supported on this chip. To see if
1607 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
1608 // or greater.
1609 __kmp_x86_cpuid(0, 0, &buf);
1610 if (buf.eax >= 4) {
1611 __kmp_x86_cpuid(4, 0, &buf);
1612 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1613 } else {
1614 threadInfo[nApics].maxCoresPerPkg = 1;
1615 }
1616
1617 // Infer the pkgId / coreId / threadId using only the info obtained locally.
1618 int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
1619 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1620
1621 int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
1622 int widthT = widthCT - widthC;
1623 if (widthT < 0) {
1624 // I've never seen this one happen, but I suppose it could, if the cpuid
1625 // instruction on a chip was really screwed up. Make sure to restore the
1626 // affinity mask before the tail call.
1627 __kmp_free(threadInfo);
1628 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1629 return false;
1630 }
1631
1632 int maskC = (1 << widthC) - 1;
1633 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
1634
1635 int maskT = (1 << widthT) - 1;
1636 threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1637
1638 nApics++;
1639 }
1640
1641 // We've collected all the info we need.
1642 // Restore the old affinity mask for this thread.
1643 previous_affinity.restore();
1644
1645 // Sort the threadInfo table by physical Id.
1646 qsort(threadInfo, nApics, sizeof(*threadInfo),
1647 __kmp_affinity_cmp_apicThreadInfo_phys_id);
1648
1649 // The table is now sorted by pkgId / coreId / threadId, but we really don't
1650 // know the radix of any of the fields. pkgId's may be sparsely assigned among
1651 // the chips on a system. Although coreId's are usually assigned
1652 // [0 .. coresPerPkg-1] and threadId's are usually assigned
1653 // [0..threadsPerCore-1], we don't want to make any such assumptions.
1654 //
1655 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1656 // total # packages) are at this point - we want to determine that now. We
1657 // only have an upper bound on the first two figures.
1658 //
1659 // We also perform a consistency check at this point: the values returned by
1660 // the cpuid instruction for any thread bound to a given package had better
1661 // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1662 nPackages = 1;
1663 nCoresPerPkg = 1;
1664 __kmp_nThreadsPerCore = 1;
1665 unsigned nCores = 1;
1666
1667 unsigned pkgCt = 1; // to determine radii
1668 unsigned lastPkgId = threadInfo[0].pkgId;
1669 unsigned coreCt = 1;
1670 unsigned lastCoreId = threadInfo[0].coreId;
1671 unsigned threadCt = 1;
1672 unsigned lastThreadId = threadInfo[0].threadId;
1673
1674 // intra-pkg consist checks
1675 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1676 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1677
1678 for (i = 1; i < nApics; i++) {
1679 if (threadInfo[i].pkgId != lastPkgId) {
1680 nCores++;
1681 pkgCt++;
1682 lastPkgId = threadInfo[i].pkgId;
1683 if ((int)coreCt > nCoresPerPkg)
1684 nCoresPerPkg = coreCt;
1685 coreCt = 1;
1686 lastCoreId = threadInfo[i].coreId;
1687 if ((int)threadCt > __kmp_nThreadsPerCore)
1688 __kmp_nThreadsPerCore = threadCt;
1689 threadCt = 1;
1690 lastThreadId = threadInfo[i].threadId;
1691
1692 // This is a different package, so go on to the next iteration without
1693 // doing any consistency checks. Reset the consistency check vars, though.
1694 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1695 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1696 continue;
1697 }
1698
1699 if (threadInfo[i].coreId != lastCoreId) {
1700 nCores++;
1701 coreCt++;
1702 lastCoreId = threadInfo[i].coreId;
1703 if ((int)threadCt > __kmp_nThreadsPerCore)
1704 __kmp_nThreadsPerCore = threadCt;
1705 threadCt = 1;
1706 lastThreadId = threadInfo[i].threadId;
1707 } else if (threadInfo[i].threadId != lastThreadId) {
1708 threadCt++;
1709 lastThreadId = threadInfo[i].threadId;
1710 } else {
1711 __kmp_free(threadInfo);
1712 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1713 return false;
1714 }
1715
1716 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1717    // fields agree for all the threads bound to a given package.
1718 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
1719 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1720 __kmp_free(threadInfo);
1721 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1722 return false;
1723 }
1724 }
1725 // When affinity is off, this routine will still be called to set
1726 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1727 // Make sure all these vars are set correctly
1728 nPackages = pkgCt;
1729 if ((int)coreCt > nCoresPerPkg)
1730 nCoresPerPkg = coreCt;
1731 if ((int)threadCt > __kmp_nThreadsPerCore)
1732 __kmp_nThreadsPerCore = threadCt;
1733 __kmp_ncores = nCores;
1734 KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
1735
1736 // Now that we've determined the number of packages, the number of cores per
1737 // package, and the number of threads per core, we can construct the data
1738 // structure that is to be returned.
1739 int idx = 0;
1740 int pkgLevel = 0;
1741 int coreLevel = 1;
1742 int threadLevel = 2;
1743 //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1744 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1745 kmp_hw_t types[3];
1746 if (pkgLevel >= 0)
1747 types[idx++] = KMP_HW_SOCKET;
1748 if (coreLevel >= 0)
1749 types[idx++] = KMP_HW_CORE;
1750 if (threadLevel >= 0)
1751 types[idx++] = KMP_HW_THREAD;
1752
1753 KMP_ASSERT(depth > 0);
1754 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
1755
1756 for (i = 0; i < nApics; ++i) {
1757 idx = 0;
1758 unsigned os = threadInfo[i].osId;
1759 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
1760 hw_thread.clear();
1761
1762 if (pkgLevel >= 0) {
1763 hw_thread.ids[idx++] = threadInfo[i].pkgId;
1764 }
1765 if (coreLevel >= 0) {
1766 hw_thread.ids[idx++] = threadInfo[i].coreId;
1767 }
1768 if (threadLevel >= 0) {
1769 hw_thread.ids[idx++] = threadInfo[i].threadId;
1770 }
1771 hw_thread.os_id = os;
1772 }
1773
1774 __kmp_free(threadInfo);
1775 __kmp_topology->sort_ids();
1776 if (!__kmp_topology->check_ids()) {
1777 kmp_topology_t::deallocate(__kmp_topology);
1778 __kmp_topology = nullptr;
1779 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1780 return false;
1781 }
1782 return true;
1783}
1784
1785// Intel(R) microarchitecture code name Nehalem, Dunnington and later
1786// architectures support a newer interface for specifying the x2APIC Ids,
1787// based on CPUID.B or CPUID.1F
1788/*
1789 * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
1790                    Bits            Bits           Bits           Bits
1791                   31-16           15-8            7-5            4-0
1792---+-----------+--------------+-------------+-----------------+
1793EAX| reserved  |   reserved   |  reserved   |  Bits to Shift  |
1794---+-----------+--------------+-------------+-----------------+
1795EBX| reserved  |   Num logical processors at level (16 bits)  |
1796---+-----------+--------------+-------------------------------+
1797ECX| reserved  |  Level Type  |     Level Number (8 bits)     |
1798---+-----------+--------------+-------------------------------+
1799EDX|                    X2APIC ID (32 bits)                   |
1800---+----------------------------------------------------------+
1801*/
1802
1803enum {
1804 INTEL_LEVEL_TYPE_INVALID = 0, // Package level
1805 INTEL_LEVEL_TYPE_SMT = 1,
1806 INTEL_LEVEL_TYPE_CORE = 2,
1807 INTEL_LEVEL_TYPE_TILE = 3,
1808 INTEL_LEVEL_TYPE_MODULE = 4,
1809 INTEL_LEVEL_TYPE_DIE = 5,
1810 INTEL_LEVEL_TYPE_LAST = 6,
1811};
1812
1813struct cpuid_level_info_t {
1814 unsigned level_type, mask, mask_width, nitems, cache_mask;
1815};
1816
1817static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
1818 switch (intel_type) {
1819 case INTEL_LEVEL_TYPE_INVALID:
1820 return KMP_HW_SOCKET;
1821 case INTEL_LEVEL_TYPE_SMT:
1822 return KMP_HW_THREAD;
1823 case INTEL_LEVEL_TYPE_CORE:
1824 return KMP_HW_CORE;
1825 case INTEL_LEVEL_TYPE_TILE:
1826 return KMP_HW_TILE;
1827 case INTEL_LEVEL_TYPE_MODULE:
1828 return KMP_HW_MODULE;
1829 case INTEL_LEVEL_TYPE_DIE:
1830 return KMP_HW_DIE;
1831 }
1832 return KMP_HW_UNKNOWN;
1833}
1834
1835// This function takes the topology leaf, a levels array to store the levels
1836// detected and a bitmap of the known levels.
1837// Returns the number of levels in the topology
1838static unsigned
1839__kmp_x2apicid_get_levels(int leaf,
1840 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
1841 kmp_uint64 known_levels) {
1842 unsigned level, levels_index;
1843 unsigned level_type, mask_width, nitems;
1844 kmp_cpuid buf;
1845
1846  // The algorithm folds unknown topology layers into the nearest known layer
1847  // below them when unknown topology layers exist.
1848  // e.g., suppose the layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X>,
1849  // <Y>, <Z> are unknown topology layers. Then SMT will take on the
1850  // characteristics of (SMT x <X>) and CORE will take on the characteristics
1851  // of (CORE x <Y> x <Z>). This eliminates the unknown portions of the
1852  // topology while still keeping the correct structure.
1853 level = levels_index = 0;
1854 do {
1855 __kmp_x86_cpuid(leaf, level, &buf);
1856 level_type = __kmp_extract_bits<8, 15>(buf.ecx);
1857 mask_width = __kmp_extract_bits<0, 4>(buf.eax);
1858 nitems = __kmp_extract_bits<0, 15>(buf.ebx);
1859 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
1860 return 0;
1861
1862 if (known_levels & (1ull << level_type)) {
1863 // Add a new level to the topology
1864 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
1865 levels[levels_index].level_type = level_type;
1866 levels[levels_index].mask_width = mask_width;
1867 levels[levels_index].nitems = nitems;
1868 levels_index++;
1869 } else {
1870 // If it is an unknown level, then logically move the previous layer up
1871 if (levels_index > 0) {
1872 levels[levels_index - 1].mask_width = mask_width;
1873 levels[levels_index - 1].nitems = nitems;
1874 }
1875 }
1876 level++;
1877 } while (level_type != INTEL_LEVEL_TYPE_INVALID);
1878
1879 // Set the masks to & with apicid
1880 for (unsigned i = 0; i < levels_index; ++i) {
1881 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
1882 levels[i].mask = ~((-1) << levels[i].mask_width);
1883 levels[i].cache_mask = (-1) << levels[i].mask_width;
1884 for (unsigned j = 0; j < i; ++j)
1885 levels[i].mask ^= levels[j].mask;
1886 } else {
1887 KMP_DEBUG_ASSERT(levels_index > 0);
1888 levels[i].mask = (-1) << levels[i - 1].mask_width;
1889 levels[i].cache_mask = 0;
1890 }
1891 }
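  // Worked example (hypothetical widths, for illustration only): if the leaf
  // reports SMT with mask_width = 1 and CORE with cumulative mask_width = 4,
  // followed by the terminating package entry, then:
  //   levels[0] (SMT)  : mask = 0x1,             cache_mask = ~0x1
  //   levels[1] (CORE) : mask = 0xF ^ 0x1 = 0xE, cache_mask = ~0xF
  //   levels[2] (PKG)  : mask = ~0xF (bits above the core field), cache_mask = 0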
1892 return levels_index;
1893}
1894
1895static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
1896
1897 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
1898 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
1899 unsigned levels_index;
1900 kmp_cpuid buf;
1901 kmp_uint64 known_levels;
1902 int topology_leaf, highest_leaf, apic_id;
1903 int num_leaves;
1904 static int leaves[] = {0, 0};
1905
1906 kmp_i18n_id_t leaf_message_id;
1907
1908 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
1909
1910 *msg_id = kmp_i18n_null;
1911 if (__kmp_affinity_verbose) {
1912 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
1913 }
1914
1915 // Figure out the known topology levels
1916 known_levels = 0ull;
1917 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
1918 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
1919 known_levels |= (1ull << i);
1920 }
1921 }
1922
1923 // Get the highest cpuid leaf supported
1924 __kmp_x86_cpuid(0, 0, &buf);
1925 highest_leaf = buf.eax;
1926
1927 // If a specific topology method was requested, only allow that specific leaf
1928 // otherwise, try both leaves 31 and 11 in that order
1929 num_leaves = 0;
1930 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
1931 num_leaves = 1;
1932 leaves[0] = 11;
1933 leaf_message_id = kmp_i18n_str_NoLeaf11Support;
1934 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
1935 num_leaves = 1;
1936 leaves[0] = 31;
1937 leaf_message_id = kmp_i18n_str_NoLeaf31Support;
1938 } else {
1939 num_leaves = 2;
1940 leaves[0] = 31;
1941 leaves[1] = 11;
1942 leaf_message_id = kmp_i18n_str_NoLeaf11Support;
1943 }
1944
1945 // Check to see if cpuid leaf 31 or 11 is supported.
1946 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1947 topology_leaf = -1;
1948 for (int i = 0; i < num_leaves; ++i) {
1949 int leaf = leaves[i];
1950 if (highest_leaf < leaf)
1951 continue;
1952 __kmp_x86_cpuid(leaf, 0, &buf);
1953 if (buf.ebx == 0)
1954 continue;
1955 topology_leaf = leaf;
1956 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
1957 if (levels_index == 0)
1958 continue;
1959 break;
1960 }
1961 if (topology_leaf == -1 || levels_index == 0) {
1962 *msg_id = leaf_message_id;
1963 return false;
1964 }
1965 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
1966
1967 // The algorithm used starts by setting the affinity to each available thread
1968 // and retrieving info from the cpuid instruction, so if we are not capable of
1969  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
1970 // we need to do something else - use the defaults that we calculated from
1971 // issuing cpuid without binding to each proc.
1972 if (!KMP_AFFINITY_CAPABLE()) {
1973 // Hack to try and infer the machine topology using only the data
1974 // available from cpuid on the current thread, and __kmp_xproc.
1975 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1976 for (unsigned i = 0; i < levels_index; ++i) {
1977 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
1978 __kmp_nThreadsPerCore = levels[i].nitems;
1979 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
1980 nCoresPerPkg = levels[i].nitems;
1981 }
1982 }
1983 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1984 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1985 return true;
1986 }
1987
1988 // Allocate the data structure to be returned.
1989 int depth = levels_index;
1990 for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
1991 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
1992 __kmp_topology =
1993 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
1994
1995 // Insert equivalent cache types if they exist
1996 kmp_cache_info_t cache_info;
1997 for (size_t i = 0; i < cache_info.get_depth(); ++i) {
1998 const kmp_cache_info_t::info_t &info = cache_info[i];
1999 unsigned cache_mask = info.mask;
2000 unsigned cache_level = info.level;
2001 for (unsigned j = 0; j < levels_index; ++j) {
2002 unsigned hw_cache_mask = levels[j].cache_mask;
2003 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2004 if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2005 kmp_hw_t type =
2006 __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2007 __kmp_topology->set_equivalent_type(cache_type, type);
2008 }
2009 }
2010 }
2011
2012 // From here on, we can assume that it is safe to call
2013 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2014 // __kmp_affinity_type = affinity_none.
2015
2016 // Save the affinity mask for the current thread.
2017 kmp_affinity_raii_t previous_affinity;
2018
2019 // Run through each of the available contexts, binding the current thread
2020 // to it, and obtaining the pertinent information using the cpuid instr.
2021 unsigned int proc;
2022 int hw_thread_index = 0;
2023 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2024 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2025 unsigned my_levels_index;
2026
2027 // Skip this proc if it is not included in the machine model.
2028 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2029 continue;
2030 }
2031 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2032
2033 __kmp_affinity_dispatch->bind_thread(proc);
2034
2035    // Query the topology leaf on this thread to get its x2APIC id and levels.
2036 __kmp_x86_cpuid(topology_leaf, 0, &buf);
2037 apic_id = buf.edx;
2038 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2039 my_levels_index =
2040 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2041 if (my_levels_index == 0 || my_levels_index != levels_index) {
2042 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2043 return false;
2044 }
2045 hw_thread.clear();
2046 hw_thread.os_id = proc;
2047 // Put in topology information
2048 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2049 hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2050 if (j > 0) {
2051 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2052 }
2053 }
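    // Worked example (hypothetical, continuing the widths above): with
    // my_levels = {SMT: mask 0x1, width 1}, {CORE: mask 0xE, width 4},
    // {PKG: mask ~0xF} and apic_id = 0xB (0b1011), the ids are filled in
    // leaf-to-root as:
    //   ids[2] (thread) = 0xB & 0x1         = 1
    //   ids[1] (core)   = (0xB & 0xE) >> 1  = 5
    //   ids[0] (socket) = (0xB & ~0xF) >> 4 = 0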
2054 hw_thread_index++;
2055 }
2056 KMP_ASSERT(hw_thread_index > 0);
2057 __kmp_topology->sort_ids();
2058 if (!__kmp_topology->check_ids()) {
2059 kmp_topology_t::deallocate(__kmp_topology);
2060 __kmp_topology = nullptr;
2061 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2062 return false;
2063 }
2064 return true;
2065}
2066#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2067
2068#define osIdIndex 0
2069#define threadIdIndex 1
2070#define coreIdIndex 2
2071#define pkgIdIndex 3
2072#define nodeIdIndex 4
2073
2074typedef unsigned *ProcCpuInfo;
2075static unsigned maxIndex = pkgIdIndex;
2076
2077static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2078 const void *b) {
2079 unsigned i;
2080 const unsigned *aa = *(unsigned *const *)a;
2081 const unsigned *bb = *(unsigned *const *)b;
2082 for (i = maxIndex;; i--) {
2083 if (aa[i] < bb[i])
2084 return -1;
2085 if (aa[i] > bb[i])
2086 return 1;
2087 if (i == osIdIndex)
2088 break;
2089 }
2090 return 0;
2091}
2092
2093#if KMP_USE_HIER_SCHED
2094// Set the array sizes for the hierarchy layers
2095static void __kmp_dispatch_set_hierarchy_values() {
2096 // Set the maximum number of L1's to number of cores
2097  // Set the maximum number of L2's to either the number of cores / 2 for
2098  // Intel(R) Xeon Phi(TM) coprocessors formerly codenamed Knights Landing,
2099  // or the number of cores for Intel(R) Xeon(R) processors
2100 // Set the maximum number of NUMA nodes and L3's to number of packages
2101 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2102 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2103 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2104#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2105 KMP_MIC_SUPPORTED
2106 if (__kmp_mic_type >= mic3)
2107 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2108 else
2109#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
2110 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2111 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2112 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2113 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2114 // Set the number of threads per unit
2115 // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2116 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2117 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2118 __kmp_nThreadsPerCore;
2119#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2120 KMP_MIC_SUPPORTED
2121 if (__kmp_mic_type >= mic3)
2122 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2123 2 * __kmp_nThreadsPerCore;
2124 else
2125#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
2126 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2127 __kmp_nThreadsPerCore;
2128 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2129 nCoresPerPkg * __kmp_nThreadsPerCore;
2130 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2131 nCoresPerPkg * __kmp_nThreadsPerCore;
2132 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2133 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2134}
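// Example (hypothetical non-MIC machine, for illustration only): with
// nPackages = 2, nCoresPerPkg = 8 and __kmp_nThreadsPerCore = 2 this sets
//   max units   : THREAD+1 = 32, L1+1 = 16, L2+1 = 16, L3+1 = 2, NUMA+1 = 2
//   threads per : THREAD+1 = 1,  L1+1 = 2,  L2+1 = 2,  L3+1 = 16, NUMA+1 = 16
// with LOOP+1 spanning the whole machine (1 unit, 32 threads).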
2135
2136// Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2137// i.e., this thread's L1 or this thread's L2, etc.
2138int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2139 int index = type + 1;
2140 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2141 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2142 if (type == kmp_hier_layer_e::LAYER_THREAD)
2143 return tid;
2144 else if (type == kmp_hier_layer_e::LAYER_LOOP)
2145 return 0;
2146 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2147 if (tid >= num_hw_threads)
2148 tid = tid % num_hw_threads;
2149 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2150}
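// Example (hypothetical machine, for illustration only): with 8 cores and
// 2 threads per core, __kmp_hier_threads_per[LAYER_L1 + 1] == 2 and
// __kmp_hier_max_units[LAYER_L1 + 1] == 8, so tid 5 maps to L1 index
// (5 / 2) % 8 == 2, i.e. threads 4 and 5 share the third core's L1.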
2151
2152// Return the number of t1's per t2
2153int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2154 int i1 = t1 + 1;
2155 int i2 = t2 + 1;
2156 KMP_DEBUG_ASSERT(i1 <= i2);
2157 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2158 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2159 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2160 // (nthreads/t2) / (nthreads/t1) = t1 / t2
2161 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2162}
2163#endif // KMP_USE_HIER_SCHED
2164
2165static inline const char *__kmp_cpuinfo_get_filename() {
2166 const char *filename;
2167 if (__kmp_cpuinfo_file != nullptr)
2168 filename = __kmp_cpuinfo_file;
2169 else
2170 filename = "/proc/cpuinfo";
2171 return filename;
2172}
2173
2174static inline const char *__kmp_cpuinfo_get_envvar() {
2175 const char *envvar = nullptr;
2176 if (__kmp_cpuinfo_file != nullptr)
2177 envvar = "KMP_CPUINFO_FILE";
2178 return envvar;
2179}
2180
2181// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2182// affinity map.
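// A record in that file typically looks like (hypothetical example; the exact
// fields vary by architecture):
//   processor   : 0
//   physical id : 0
//   core id     : 0
// Records are separated by blank lines, and only the "processor",
// "physical id", "core id", "thread id" and "node_<n> id" fields are used.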
2183static bool __kmp_affinity_create_cpuinfo_map(int *line,
2184 kmp_i18n_id_t *const msg_id) {
2185 const char *filename = __kmp_cpuinfo_get_filename();
2186 const char *envvar = __kmp_cpuinfo_get_envvar();
2187 *msg_id = kmp_i18n_null;
2188
2189 if (__kmp_affinity_verbose) {
2190 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2191 }
2192
2193 kmp_safe_raii_file_t f(filename, "r", envvar);
2194
2195  // Scan the file once, counting the number of "processor" (osId) fields
2196  // and finding the highest value of <n> for any node_<n> field.
2197 char buf[256];
2198 unsigned num_records = 0;
2199 while (!feof(f)) {
2200 buf[sizeof(buf) - 1] = 1;
2201 if (!fgets(buf, sizeof(buf), f)) {
2202 // Read errors presumably because of EOF
2203 break;
2204 }
2205
2206 char s1[] = "processor";
2207 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2208 num_records++;
2209 continue;
2210 }
2211
2212 // FIXME - this will match "node_<n> <garbage>"
2213 unsigned level;
2214 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2215      // validate the input first:
2216 if (level > (unsigned)__kmp_xproc) { // level is too big
2217 level = __kmp_xproc;
2218 }
2219 if (nodeIdIndex + level >= maxIndex) {
2220 maxIndex = nodeIdIndex + level;
2221 }
2222 continue;
2223 }
2224 }
2225
2226 // Check for empty file / no valid processor records, or too many. The number
2227 // of records can't exceed the number of valid bits in the affinity mask.
2228 if (num_records == 0) {
2229 *msg_id = kmp_i18n_str_NoProcRecords;
2230 return false;
2231 }
2232 if (num_records > (unsigned)__kmp_xproc) {
2233 *msg_id = kmp_i18n_str_TooManyProcRecords;
2234 return false;
2235 }
2236
2237 // Set the file pointer back to the beginning, so that we can scan the file
2238 // again, this time performing a full parse of the data. Allocate a vector of
2239  // ProcCpuInfo objects, where we will place the data. Adding an extra element
2240 // at the end allows us to remove a lot of extra checks for termination
2241 // conditions.
2242 if (fseek(f, 0, SEEK_SET) != 0) {
2243 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2244 return false;
2245 }
2246
2247 // Allocate the array of records to store the proc info in. The dummy
2248 // element at the end makes the logic in filling them out easier to code.
2249 unsigned **threadInfo =
2250 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2251 unsigned i;
2252 for (i = 0; i <= num_records; i++) {
2253 threadInfo[i] =
2254 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2255 }
2256
2257#define CLEANUP_THREAD_INFO \
2258 for (i = 0; i <= num_records; i++) { \
2259 __kmp_free(threadInfo[i]); \
2260 } \
2261 __kmp_free(threadInfo);
2262
2263 // A value of UINT_MAX means that we didn't find the field
2264 unsigned __index;
2265
2266#define INIT_PROC_INFO(p) \
2267 for (__index = 0; __index <= maxIndex; __index++) { \
2268 (p)[__index] = UINT_MAX; \
2269 }
2270
2271 for (i = 0; i <= num_records; i++) {
2272 INIT_PROC_INFO(threadInfo[i]);
2273 }
2274
2275 unsigned num_avail = 0;
2276 *line = 0;
2277 while (!feof(f)) {
2278 // Create an inner scoping level, so that all the goto targets at the end of
2279 // the loop appear in an outer scoping level. This avoids warnings about
2280 // jumping past an initialization to a target in the same block.
2281 {
2282 buf[sizeof(buf) - 1] = 1;
2283 bool long_line = false;
2284 if (!fgets(buf, sizeof(buf), f)) {
2285 // Read errors presumably because of EOF
2286 // If there is valid data in threadInfo[num_avail], then fake
2287        // a blank line to ensure that the last record gets parsed.
2288 bool valid = false;
2289 for (i = 0; i <= maxIndex; i++) {
2290 if (threadInfo[num_avail][i] != UINT_MAX) {
2291 valid = true;
2292 }
2293 }
2294 if (!valid) {
2295 break;
2296 }
2297 buf[0] = 0;
2298 } else if (!buf[sizeof(buf) - 1]) {
2299 // The line is longer than the buffer. Set a flag and don't
2300 // emit an error if we were going to ignore the line, anyway.
2301 long_line = true;
2302
2303#define CHECK_LINE \
2304 if (long_line) { \
2305 CLEANUP_THREAD_INFO; \
2306 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2307 return false; \
2308 }
2309 }
2310 (*line)++;
2311
2312 char s1[] = "processor";
2313 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2314 CHECK_LINE;
2315 char *p = strchr(buf + sizeof(s1) - 1, ':');
2316 unsigned val;
2317 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2318 goto no_val;
2319 if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2320#if KMP_ARCH_AARCH64
2321          // Handle the old AArch64 /proc/cpuinfo layout differently:
2322          // it lists all of the 'processor' entries in a single
2323          // 'Processor' section, so the usual check for duplicates
2324          // in that section would always fail.
2325 num_avail++;
2326#else
2327 goto dup_field;
2328#endif
2329 threadInfo[num_avail][osIdIndex] = val;
2330#if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2331 char path[256];
2332 KMP_SNPRINTF(
2333 path, sizeof(path),
2334 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2335 threadInfo[num_avail][osIdIndex]);
2336 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2337
2338 KMP_SNPRINTF(path, sizeof(path),
2339 "/sys/devices/system/cpu/cpu%u/topology/core_id",
2340 threadInfo[num_avail][osIdIndex]);
2341 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2342 continue;
2343#else
2344 }
2345 char s2[] = "physical id";
2346 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2347 CHECK_LINE;
2348 char *p = strchr(buf + sizeof(s2) - 1, ':');
2349 unsigned val;
2350 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2351 goto no_val;
2352 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
2353 goto dup_field;
2354 threadInfo[num_avail][pkgIdIndex] = val;
2355 continue;
2356 }
2357 char s3[] = "core id";
2358 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2359 CHECK_LINE;
2360 char *p = strchr(buf + sizeof(s3) - 1, ':');
2361 unsigned val;
2362 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2363 goto no_val;
2364 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
2365 goto dup_field;
2366 threadInfo[num_avail][coreIdIndex] = val;
2367 continue;
2368#endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2369 }
2370 char s4[] = "thread id";
2371 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2372 CHECK_LINE;
2373 char *p = strchr(buf + sizeof(s4) - 1, ':');
2374 unsigned val;
2375 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2376 goto no_val;
2377 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
2378 goto dup_field;
2379 threadInfo[num_avail][threadIdIndex] = val;
2380 continue;
2381 }
2382 unsigned level;
2383 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2384 CHECK_LINE;
2385 char *p = strchr(buf + sizeof(s4) - 1, ':');
2386 unsigned val;
2387 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2388 goto no_val;
2389 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2390 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
2391 goto dup_field;
2392 threadInfo[num_avail][nodeIdIndex + level] = val;
2393 continue;
2394 }
2395
2396 // We didn't recognize the leading token on the line. There are lots of
2397 // leading tokens that we don't recognize - if the line isn't empty, go on
2398 // to the next line.
2399 if ((*buf != 0) && (*buf != '\n')) {
2400 // If the line is longer than the buffer, read characters
2401 // until we find a newline.
2402 if (long_line) {
2403 int ch;
2404 while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
2405 ;
2406 }
2407 continue;
2408 }
2409
2410 // A newline has signalled the end of the processor record.
2411 // Check that there aren't too many procs specified.
2412 if ((int)num_avail == __kmp_xproc) {
2413 CLEANUP_THREAD_INFO;
2414 *msg_id = kmp_i18n_str_TooManyEntries;
2415 return false;
2416 }
2417
2418 // Check for missing fields. The osId field must be there, and we
2419      // currently require that the physical id field is specified as well.
2420 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2421 CLEANUP_THREAD_INFO;
2422 *msg_id = kmp_i18n_str_MissingProcField;
2423 return false;
2424 }
2425 if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2426 CLEANUP_THREAD_INFO;
2427 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2428 return false;
2429 }
2430
2431 // Skip this proc if it is not included in the machine model.
2432 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
2433 __kmp_affin_fullMask)) {
2434 INIT_PROC_INFO(threadInfo[num_avail]);
2435 continue;
2436 }
2437
2438 // We have a successful parse of this proc's info.
2439 // Increment the counter, and prepare for the next proc.
2440 num_avail++;
2441 KMP_ASSERT(num_avail <= num_records);
2442 INIT_PROC_INFO(threadInfo[num_avail]);
2443 }
2444 continue;
2445
2446 no_val:
2447 CLEANUP_THREAD_INFO;
2448 *msg_id = kmp_i18n_str_MissingValCpuinfo;
2449 return false;
2450
2451 dup_field:
2452 CLEANUP_THREAD_INFO;
2453 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2454 return false;
2455 }
2456 *line = 0;
2457
2458#if KMP_MIC && REDUCE_TEAM_SIZE
2459 unsigned teamSize = 0;
2460#endif // KMP_MIC && REDUCE_TEAM_SIZE
2461
2462 // check for num_records == __kmp_xproc ???
2463
2464 // If it is configured to omit the package level when there is only a single
2465 // package, the logic at the end of this routine won't work if there is only a
2466 // single thread
2467 KMP_ASSERT(num_avail > 0);
2468 KMP_ASSERT(num_avail <= num_records);
2469
2470 // Sort the threadInfo table by physical Id.
2471 qsort(threadInfo, num_avail, sizeof(*threadInfo),
2472 __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2473
2474 // The table is now sorted by pkgId / coreId / threadId, but we really don't
2475 // know the radix of any of the fields. pkgId's may be sparsely assigned among
2476 // the chips on a system. Although coreId's are usually assigned
2477 // [0 .. coresPerPkg-1] and threadId's are usually assigned
2478 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2479 //
2480 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2481 // total # packages) are at this point - we want to determine that now. We
2482 // only have an upper bound on the first two figures.
2483 unsigned *counts =
2484 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2485 unsigned *maxCt =
2486 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2487 unsigned *totals =
2488 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2489 unsigned *lastId =
2490 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2491
2492 bool assign_thread_ids = false;
2493 unsigned threadIdCt;
2494 unsigned index;
2495
2496restart_radix_check:
2497 threadIdCt = 0;
2498
2499 // Initialize the counter arrays with data from threadInfo[0].
2500 if (assign_thread_ids) {
2501 if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2502 threadInfo[0][threadIdIndex] = threadIdCt++;
2503 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2504 threadIdCt = threadInfo[0][threadIdIndex] + 1;
2505 }
2506 }
2507 for (index = 0; index <= maxIndex; index++) {
2508 counts[index] = 1;
2509 maxCt[index] = 1;
2510 totals[index] = 1;
2511 lastId[index] = threadInfo[0][index];
2513 }
2514
2515 // Run through the rest of the OS procs.
2516 for (i = 1; i < num_avail; i++) {
2517 // Find the most significant index whose id differs from the id for the
2518 // previous OS proc.
2519 for (index = maxIndex; index >= threadIdIndex; index--) {
2520 if (assign_thread_ids && (index == threadIdIndex)) {
2521 // Auto-assign the thread id field if it wasn't specified.
2522 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2523 threadInfo[i][threadIdIndex] = threadIdCt++;
2524 }
2525 // Apparently the thread id field was specified for some entries and not
2526 // others. Start the thread id counter off at the next higher thread id.
2527 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2528 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2529 }
2530 }
2531 if (threadInfo[i][index] != lastId[index]) {
2532 // Run through all indices which are less significant, and reset the
2533 // counts to 1. At all levels up to and including index, we need to
2534 // increment the totals and record the last id.
2535 unsigned index2;
2536 for (index2 = threadIdIndex; index2 < index; index2++) {
2537 totals[index2]++;
2538 if (counts[index2] > maxCt[index2]) {
2539 maxCt[index2] = counts[index2];
2540 }
2541 counts[index2] = 1;
2542 lastId[index2] = threadInfo[i][index2];
2543 }
2544 counts[index]++;
2545 totals[index]++;
2546 lastId[index] = threadInfo[i][index];
2547
2548 if (assign_thread_ids && (index > threadIdIndex)) {
2549
2550#if KMP_MIC && REDUCE_TEAM_SIZE
2551 // The default team size is the total #threads in the machine
2552 // minus 1 thread for every core that has 3 or more threads.
2553 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2554#endif // KMP_MIC && REDUCE_TEAM_SIZE
2555
2556 // Restart the thread counter, as we are on a new core.
2557 threadIdCt = 0;
2558
2559 // Auto-assign the thread id field if it wasn't specified.
2560 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2561 threadInfo[i][threadIdIndex] = threadIdCt++;
2562 }
2563
2564 // Apparently the thread id field was specified for some entries and
2565 // not others. Start the thread id counter off at the next higher
2566 // thread id.
2567 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2568 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2569 }
2570 }
2571 break;
2572 }
2573 }
2574 if (index < threadIdIndex) {
2575 // If thread ids were specified, it is an error if they are not unique.
2576      // Also, check that we haven't already restarted the loop (to be safe -
2577 // shouldn't need to).
2578 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2579 __kmp_free(lastId);
2580 __kmp_free(totals);
2581 __kmp_free(maxCt);
2582 __kmp_free(counts);
2583 CLEANUP_THREAD_INFO;
2584 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2585 return false;
2586 }
2587
2588      // If the thread ids were not specified and we see entries that
2589 // are duplicates, start the loop over and assign the thread ids manually.
2590 assign_thread_ids = true;
2591 goto restart_radix_check;
2592 }
2593 }
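  // Worked example (hypothetical box, for illustration only): for a sorted
  // table describing 2 packages x 2 cores x 2 threads, the loop above ends
  // with totals[pkgIdIndex] = 2, totals[coreIdIndex] = 4,
  // totals[threadIdIndex] = 8 and maxCt[coreIdIndex] = maxCt[threadIdIndex]
  // = 2, which yields nPackages = 2, nCoresPerPkg = 2,
  // __kmp_nThreadsPerCore = 2 and __kmp_ncores = 4 below.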
2594
2595#if KMP_MIC && REDUCE_TEAM_SIZE
2596 // The default team size is the total #threads in the machine
2597 // minus 1 thread for every core that has 3 or more threads.
2598 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2599#endif // KMP_MIC && REDUCE_TEAM_SIZE
2600
2601 for (index = threadIdIndex; index <= maxIndex; index++) {
2602 if (counts[index] > maxCt[index]) {
2603 maxCt[index] = counts[index];
2604 }
2605 }
2606
2607 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2608 nCoresPerPkg = maxCt[coreIdIndex];
2609 nPackages = totals[pkgIdIndex];
2610
2611 // When affinity is off, this routine will still be called to set
2612 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2613 // Make sure all these vars are set correctly, and return now if affinity is
2614 // not enabled.
2615 __kmp_ncores = totals[coreIdIndex];
2616 if (!KMP_AFFINITY_CAPABLE()) {
2617 KMP_ASSERT(__kmp_affinity_type == affinity_none);
2618 return true;
2619 }
2620
2621#if KMP_MIC && REDUCE_TEAM_SIZE
2622 // Set the default team size.
2623 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2624 __kmp_dflt_team_nth = teamSize;
2625 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
2626 "__kmp_dflt_team_nth = %d\n",
2627 __kmp_dflt_team_nth));
2628 }
2629#endif // KMP_MIC && REDUCE_TEAM_SIZE
2630
2631 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
2632
2633 // Count the number of levels which have more nodes at that level than at the
2634 // parent's level (with there being an implicit root node of the top level).
2635 // This is equivalent to saying that there is at least one node at this level
2636 // which has a sibling. These levels are in the map, and the package level is
2637 // always in the map.
2638 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2639 for (index = threadIdIndex; index < maxIndex; index++) {
2640 KMP_ASSERT(totals[index] >= totals[index + 1]);
2641 inMap[index] = (totals[index] > totals[index + 1]);
2642 }
2643 inMap[maxIndex] = (totals[maxIndex] > 1);
2644 inMap[pkgIdIndex] = true;
2645 inMap[coreIdIndex] = true;
2646 inMap[threadIdIndex] = true;
2647
2648 int depth = 0;
2649 int idx = 0;
2650 kmp_hw_t types[KMP_HW_LAST];
2651 int pkgLevel = -1;
2652 int coreLevel = -1;
2653 int threadLevel = -1;
2654 for (index = threadIdIndex; index <= maxIndex; index++) {
2655 if (inMap[index]) {
2656 depth++;
2657 }
2658 }
2659 if (inMap[pkgIdIndex]) {
2660 pkgLevel = idx;
2661 types[idx++] = KMP_HW_SOCKET;
2662 }
2663 if (inMap[coreIdIndex]) {
2664 coreLevel = idx;
2665 types[idx++] = KMP_HW_CORE;
2666 }
2667 if (inMap[threadIdIndex]) {
2668 threadLevel = idx;
2669 types[idx++] = KMP_HW_THREAD;
2670 }
2671 KMP_ASSERT(depth > 0);
2672
2673 // Construct the data structure that is to be returned.
2674 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
2675
2676 for (i = 0; i < num_avail; ++i) {
2677 unsigned os = threadInfo[i][osIdIndex];
2678 int src_index;
2679 int dst_index = 0;
2680 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2681 hw_thread.clear();
2682 hw_thread.os_id = os;
2683
2684 idx = 0;
2685 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2686 if (!inMap[src_index]) {
2687 continue;
2688 }
2689 if (src_index == pkgIdIndex) {
2690 hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
2691 } else if (src_index == coreIdIndex) {
2692 hw_thread.ids[coreLevel] = threadInfo[i][src_index];
2693 } else if (src_index == threadIdIndex) {
2694 hw_thread.ids[threadLevel] = threadInfo[i][src_index];
2695 }
2696 dst_index++;
2697 }
2698 }
2699
2700 __kmp_free(inMap);
2701 __kmp_free(lastId);
2702 __kmp_free(totals);
2703 __kmp_free(maxCt);
2704 __kmp_free(counts);
2705 CLEANUP_THREAD_INFO;
2706 __kmp_topology->sort_ids();
2707 if (!__kmp_topology->check_ids()) {
2708 kmp_topology_t::deallocate(__kmp_topology);
2709 __kmp_topology = nullptr;
2710 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2711 return false;
2712 }
2713 return true;
2714}
2715
2716// Create and return a table of affinity masks, indexed by OS thread ID.
2717// This routine handles OR'ing together all the affinity masks of threads
2718// that are sufficiently close, if granularity > fine.
2719static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
2720 unsigned *numUnique) {
2721 // First form a table of affinity masks in order of OS thread id.
2722 int maxOsId;
2723 int i;
2724 int numAddrs = __kmp_topology->get_num_hw_threads();
2725 int depth = __kmp_topology->get_depth();
2726 KMP_ASSERT(numAddrs);
2727 KMP_ASSERT(depth);
2728
2729 maxOsId = 0;
2730 for (i = numAddrs - 1;; --i) {
2731 int osId = __kmp_topology->at(i).os_id;
2732 if (osId > maxOsId) {
2733 maxOsId = osId;
2734 }
2735 if (i == 0)
2736 break;
2737 }
2738 kmp_affin_mask_t *osId2Mask;
2739 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
2740 KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2741 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2742 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2743 }
2744 if (__kmp_affinity_gran_levels >= (int)depth) {
2745 if (__kmp_affinity_verbose ||
2746 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
2747 KMP_WARNING(AffThreadsMayMigrate);
2748 }
2749 }
2750
2751 // Run through the table, forming the masks for all threads on each core.
2752 // Threads on the same core will have identical kmp_hw_thread_t objects, not
2753 // considering the last level, which must be the thread id. All threads on a
2754 // core will appear consecutively.
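  // Example (hypothetical, for illustration only): with granularity=core on a
  // machine whose cores hold OS procs {2k, 2k+1}, both hw threads of a core
  // are "close" (the thread level is ignored), so osId2Mask[2k] and
  // osId2Mask[2k+1] both end up holding the two-bit mask {2k, 2k+1}.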
2755 int unique = 0;
2756 int j = 0; // index of 1st thread on core
2757 int leader = 0;
2758 kmp_affin_mask_t *sum;
2759 KMP_CPU_ALLOC_ON_STACK(sum);
2760 KMP_CPU_ZERO(sum);
2761 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
2762 for (i = 1; i < numAddrs; i++) {
2763 // If this thread is sufficiently close to the leader (within the
2764 // granularity setting), then set the bit for this os thread in the
2765 // affinity mask for this group, and go on to the next thread.
2766 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) {
2767 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
2768 continue;
2769 }
2770
2771 // For every thread in this group, copy the mask to the thread's entry in
2772 // the osId2Mask table. Mark the first address as a leader.
2773 for (; j < i; j++) {
2774 int osId = __kmp_topology->at(j).os_id;
2775 KMP_DEBUG_ASSERT(osId <= maxOsId);
2776 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2777 KMP_CPU_COPY(mask, sum);
2778 __kmp_topology->at(j).leader = (j == leader);
2779 }
2780 unique++;
2781
2782 // Start a new mask.
2783 leader = i;
2784 KMP_CPU_ZERO(sum);
2785 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
2786 }
2787
2788  // For every thread in the last group, copy the mask to the thread's
2789 // entry in the osId2Mask table.
2790 for (; j < i; j++) {
2791 int osId = __kmp_topology->at(j).os_id;
2792 KMP_DEBUG_ASSERT(osId <= maxOsId);
2793 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2794 KMP_CPU_COPY(mask, sum);
2795 __kmp_topology->at(j).leader = (j == leader);
2796 }
2797 unique++;
2798 KMP_CPU_FREE_FROM_STACK(sum);
2799
2800 *maxIndex = maxOsId;
2801 *numUnique = unique;
2802 return osId2Mask;
2803}
2804
2805// Stuff for the affinity proclist parsers. It's easier to declare these vars
2806// as file-static than to try and pass them through the calling sequence of
2807// the recursive-descent OMP_PLACES parser.
2808static kmp_affin_mask_t *newMasks;
2809static int numNewMasks;
2810static int nextNewMask;
2811
2812#define ADD_MASK(_mask) \
2813 { \
2814 if (nextNewMask >= numNewMasks) { \
2815 int i; \
2816 numNewMasks *= 2; \
2817 kmp_affin_mask_t *temp; \
2818 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
2819 for (i = 0; i < numNewMasks / 2; i++) { \
2820 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
2821 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
2822 KMP_CPU_COPY(dest, src); \
2823 } \
2824 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
2825 newMasks = temp; \
2826 } \
2827 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2828 nextNewMask++; \
2829 }
2830
2831#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
2832 { \
2833 if (((_osId) > _maxOsId) || \
2834 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2835 if (__kmp_affinity_verbose || \
2836 (__kmp_affinity_warnings && \
2837 (__kmp_affinity_type != affinity_none))) { \
2838 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2839 } \
2840 } else { \
2841 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2842 } \
2843 }
2844
2845// Re-parse the proclist (for the explicit affinity type), and form the list
2846// of affinity newMasks indexed by gtid.
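// Example (hypothetical OS proc ids, for illustration only): the proclist
// "0,3-5,{6,7}" produces the masks {0}, {3}, {4}, {5} and {6,7}; single procs
// and range members become singleton masks, while a {...} set is OR'd into
// one mask.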
2847static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2848 unsigned int *out_numMasks,
2849 const char *proclist,
2850 kmp_affin_mask_t *osId2Mask,
2851 int maxOsId) {
2852 int i;
2853 const char *scan = proclist;
2854 const char *next = proclist;
2855
2856  // The temporary mask vector (newMasks) starts with room for 2 masks and
2857  // is grown by doubling in ADD_MASK as masks are added.
2858 numNewMasks = 2;
2859 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2860 nextNewMask = 0;
2861 kmp_affin_mask_t *sumMask;
2862 KMP_CPU_ALLOC(sumMask);
2863 int setSize = 0;
2864
2865 for (;;) {
2866 int start, end, stride;
2867
2868 SKIP_WS(scan);
2869 next = scan;
2870 if (*next == '\0') {
2871 break;
2872 }
2873
2874 if (*next == '{') {
2875 int num;
2876 setSize = 0;
2877 next++; // skip '{'
2878 SKIP_WS(next);
2879 scan = next;
2880
2881 // Read the first integer in the set.
2882 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
2883 SKIP_DIGITS(next);
2884 num = __kmp_str_to_int(scan, *next);
2885 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2886
2887 // Copy the mask for that osId to the sum (union) mask.
2888 if ((num > maxOsId) ||
2889 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2890 if (__kmp_affinity_verbose ||
2891 (__kmp_affinity_warnings &&
2892 (__kmp_affinity_type != affinity_none))) {
2893 KMP_WARNING(AffIgnoreInvalidProcID, num);
2894 }
2895 KMP_CPU_ZERO(sumMask);
2896 } else {
2897 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2898 setSize = 1;
2899 }
2900
2901 for (;;) {
2902 // Check for end of set.
2903 SKIP_WS(next);
2904 if (*next == '}') {
2905 next++; // skip '}'
2906 break;
2907 }
2908
2909 // Skip optional comma.
2910 if (*next == ',') {
2911 next++;
2912 }
2913 SKIP_WS(next);
2914
2915 // Read the next integer in the set.
2916 scan = next;
2917 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2918
2919 SKIP_DIGITS(next);
2920 num = __kmp_str_to_int(scan, *next);
2921 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2922
2923 // Add the mask for that osId to the sum mask.
2924 if ((num > maxOsId) ||
2925 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2926 if (__kmp_affinity_verbose ||
2927 (__kmp_affinity_warnings &&
2928 (__kmp_affinity_type != affinity_none))) {
2929 KMP_WARNING(AffIgnoreInvalidProcID, num);
2930 }
2931 } else {
2932 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2933 setSize++;
2934 }
2935 }
2936 if (setSize > 0) {
2937 ADD_MASK(sumMask);
2938 }
2939
2940 SKIP_WS(next);
2941 if (*next == ',') {
2942 next++;
2943 }
2944 scan = next;
2945 continue;
2946 }
2947
2948 // Read the first integer.
2949 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2950 SKIP_DIGITS(next);
2951 start = __kmp_str_to_int(scan, *next);
2952 KMP_ASSERT2(start >= 0, "bad explicit proc list");
2953 SKIP_WS(next);
2954
2955 // If this isn't a range, then add a mask to the list and go on.
2956 if (*next != '-') {
2957 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2958
2959 // Skip optional comma.
2960 if (*next == ',') {
2961 next++;
2962 }
2963 scan = next;
2964 continue;
2965 }
2966
2967 // This is a range. Skip over the '-' and read in the 2nd int.
2968 next++; // skip '-'
2969 SKIP_WS(next);
2970 scan = next;
2971 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2972 SKIP_DIGITS(next);
2973 end = __kmp_str_to_int(scan, *next);
2974 KMP_ASSERT2(end >= 0, "bad explicit proc list");
2975
2976 // Check for a stride parameter
2977 stride = 1;
2978 SKIP_WS(next);
2979 if (*next == ':') {
2980      // A stride is specified. Skip over the ':' and read the 3rd int.
2981 int sign = +1;
2982 next++; // skip ':'
2983 SKIP_WS(next);
2984 scan = next;
2985 if (*next == '-') {
2986 sign = -1;
2987 next++;
2988 SKIP_WS(next);
2989 scan = next;
2990 }
2991 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2992 SKIP_DIGITS(next);
2993 stride = __kmp_str_to_int(scan, *next);
2994 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2995 stride *= sign;
2996 }
2997
2998 // Do some range checks.
2999 KMP_ASSERT2(stride != 0, "bad explicit proc list");
3000 if (stride > 0) {
3001 KMP_ASSERT2(start <= end, "bad explicit proc list");
3002 } else {
3003 KMP_ASSERT2(start >= end, "bad explicit proc list");
3004 }
3005 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3006
3007 // Add the mask for each OS proc # to the list.
3008 if (stride > 0) {
3009 do {
3010 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3011 start += stride;
3012 } while (start <= end);
3013 } else {
3014 do {
3015 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3016 start += stride;
3017 } while (start >= end);
3018 }
3019
3020 // Skip optional comma.
3021 SKIP_WS(next);
3022 if (*next == ',') {
3023 next++;
3024 }
3025 scan = next;
3026 }
3027
3028 *out_numMasks = nextNewMask;
3029 if (nextNewMask == 0) {
3030 *out_masks = NULL;
3031 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3032 return;
3033 }
3034 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3035 for (i = 0; i < nextNewMask; i++) {
3036 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3037 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3038 KMP_CPU_COPY(dest, src);
3039 }
3040 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3041 KMP_CPU_FREE(sumMask);
3042}
3043
3044/*-----------------------------------------------------------------------------
3045Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3046places. Again, here is the grammar:
3047
3048place_list := place
3049place_list := place , place_list
3050place := num
3051place := place : num
3052place := place : num : signed
3053place := { subplace_list }
3054place := ! place // (lowest priority)
3055subplace_list := subplace
3056subplace_list := subplace , subplace_list
3057subplace := num
3058subplace := num : num
3059subplace := num : num : signed
3060signed := num
3061signed := + signed
3062signed := - signed
3063-----------------------------------------------------------------------------*/
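// Example (hypothetical OS proc ids, for illustration only): the place list
// "{0,1}:4:2" names the initial place {0,1} plus 3 more places obtained by
// repeatedly adding the stride 2 to every proc id, i.e. {0,1}, {2,3}, {4,5}
// and {6,7}.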
3064static void __kmp_process_subplace_list(const char **scan,
3065 kmp_affin_mask_t *osId2Mask,
3066 int maxOsId, kmp_affin_mask_t *tempMask,
3067 int *setSize) {
3068 const char *next;
3069
3070 for (;;) {
3071 int start, count, stride, i;
3072
3073 // Read in the starting proc id
3074 SKIP_WS(*scan);
3075 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3076 next = *scan;
3077 SKIP_DIGITS(next);
3078 start = __kmp_str_to_int(*scan, *next);
3079 KMP_ASSERT(start >= 0);
3080 *scan = next;
3081
3082 // valid follow sets are ',' ':' and '}'
3083 SKIP_WS(*scan);
3084 if (**scan == '}' || **scan == ',') {
3085 if ((start > maxOsId) ||
3086 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3087 if (__kmp_affinity_verbose ||
3088 (__kmp_affinity_warnings &&
3089 (__kmp_affinity_type != affinity_none))) {
3090 KMP_WARNING(AffIgnoreInvalidProcID, start);
3091 }
3092 } else {
3093 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3094 (*setSize)++;
3095 }
3096 if (**scan == '}') {
3097 break;
3098 }
3099 (*scan)++; // skip ','
3100 continue;
3101 }
3102 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3103 (*scan)++; // skip ':'
3104
3105 // Read count parameter
3106 SKIP_WS(*scan);
3107 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3108 next = *scan;
3109 SKIP_DIGITS(next);
3110 count = __kmp_str_to_int(*scan, *next);
3111 KMP_ASSERT(count >= 0);
3112 *scan = next;
3113
3114 // valid follow sets are ',' ':' and '}'
3115 SKIP_WS(*scan);
3116 if (**scan == '}' || **scan == ',') {
3117 for (i = 0; i < count; i++) {
3118 if ((start > maxOsId) ||
3119 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3120 if (__kmp_affinity_verbose ||
3121 (__kmp_affinity_warnings &&
3122 (__kmp_affinity_type != affinity_none))) {
3123 KMP_WARNING(AffIgnoreInvalidProcID, start);
3124 }
3125 break; // don't proliferate warnings for large count
3126 } else {
3127 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3128 start++;
3129 (*setSize)++;
3130 }
3131 }
3132 if (**scan == '}') {
3133 break;
3134 }
3135 (*scan)++; // skip ','
3136 continue;
3137 }
3138 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3139 (*scan)++; // skip ':'
3140
3141 // Read stride parameter
3142 int sign = +1;
3143 for (;;) {
3144 SKIP_WS(*scan);
3145 if (**scan == '+') {
3146 (*scan)++; // skip '+'
3147 continue;
3148 }
3149 if (**scan == '-') {
3150 sign *= -1;
3151 (*scan)++; // skip '-'
3152 continue;
3153 }
3154 break;
3155 }
3156 SKIP_WS(*scan);
3157 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3158 next = *scan;
3159 SKIP_DIGITS(next);
3160 stride = __kmp_str_to_int(*scan, *next);
3161 KMP_ASSERT(stride >= 0);
3162 *scan = next;
3163 stride *= sign;
3164
3165 // valid follow sets are ',' and '}'
3166 SKIP_WS(*scan);
3167 if (**scan == '}' || **scan == ',') {
3168 for (i = 0; i < count; i++) {
3169 if ((start > maxOsId) ||
3170 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3171 if (__kmp_affinity_verbose ||
3172 (__kmp_affinity_warnings &&
3173 (__kmp_affinity_type != affinity_none))) {
3174 KMP_WARNING(AffIgnoreInvalidProcID, start);
3175 }
3176 break; // don't proliferate warnings for large count
3177 } else {
3178 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3179 start += stride;
3180 (*setSize)++;
3181 }
3182 }
3183 if (**scan == '}') {
3184 break;
3185 }
3186 (*scan)++; // skip ','
3187 continue;
3188 }
3189
3190 KMP_ASSERT2(0, "bad explicit places list");
3191 }
3192}
3193
3194static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3195 int maxOsId, kmp_affin_mask_t *tempMask,
3196 int *setSize) {
3197 const char *next;
3198
3199 // valid follow sets are '{' '!' and num
3200 SKIP_WS(*scan);
3201 if (**scan == '{') {
3202 (*scan)++; // skip '{'
3203 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
3204 KMP_ASSERT2(**scan == '}', "bad explicit places list");
3205 (*scan)++; // skip '}'
3206 } else if (**scan == '!') {
3207 (*scan)++; // skip '!'
3208 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3209 KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3210 } else if ((**scan >= '0') && (**scan <= '9')) {
3211 next = *scan;
3212 SKIP_DIGITS(next);
3213 int num = __kmp_str_to_int(*scan, *next);
3214 KMP_ASSERT(num >= 0);
3215 if ((num > maxOsId) ||
3216 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3217 if (__kmp_affinity_verbose ||
3218 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3219 KMP_WARNING(AffIgnoreInvalidProcID, num);
3220 }
3221 } else {
3222 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3223 (*setSize)++;
3224 }
3225 *scan = next; // skip num
3226 } else {
3227 KMP_ASSERT2(0, "bad explicit places list");
3228 }
3229}
3230
3231// static void
3232void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3233 unsigned int *out_numMasks,
3234 const char *placelist,
3235 kmp_affin_mask_t *osId2Mask,
3236 int maxOsId) {
3237 int i, j, count, stride, sign;
3238 const char *scan = placelist;
3239 const char *next = placelist;
3240
3241 numNewMasks = 2;
3242 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3243 nextNewMask = 0;
3244
3245 // tempMask is modified based on the previous or initial
3246 // place to form the current place
3247 // previousMask contains the previous place
3248 kmp_affin_mask_t *tempMask;
3249 kmp_affin_mask_t *previousMask;
3250 KMP_CPU_ALLOC(tempMask);
3251 KMP_CPU_ZERO(tempMask);
3252 KMP_CPU_ALLOC(previousMask);
3253 KMP_CPU_ZERO(previousMask);
3254 int setSize = 0;
3255
3256 for (;;) {
3257 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3258
3259 // valid follow sets are ',' ':' and EOL
3260 SKIP_WS(scan);
3261 if (*scan == '\0' || *scan == ',') {
3262 if (setSize > 0) {
3263 ADD_MASK(tempMask);
3264 }
3265 KMP_CPU_ZERO(tempMask);
3266 setSize = 0;
3267 if (*scan == '\0') {
3268 break;
3269 }
3270 scan++; // skip ','
3271 continue;
3272 }
3273
3274 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3275 scan++; // skip ':'
3276
3277 // Read count parameter
3278 SKIP_WS(scan);
3279 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3280 next = scan;
3281 SKIP_DIGITS(next);
3282 count = __kmp_str_to_int(scan, *next);
3283 KMP_ASSERT(count >= 0);
3284 scan = next;
3285
3286 // valid follow sets are ',' ':' and EOL
3287 SKIP_WS(scan);
3288 if (*scan == '\0' || *scan == ',') {
3289 stride = +1;
3290 } else {
3291 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3292 scan++; // skip ':'
3293
3294 // Read stride parameter
3295 sign = +1;
3296 for (;;) {
3297 SKIP_WS(scan);
3298 if (*scan == '+') {
3299 scan++; // skip '+'
3300 continue;
3301 }
3302 if (*scan == '-') {
3303 sign *= -1;
3304 scan++; // skip '-'
3305 continue;
3306 }
3307 break;
3308 }
3309 SKIP_WS(scan);
3310 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3311 next = scan;
3312 SKIP_DIGITS(next);
3313 stride = __kmp_str_to_int(scan, *next);
3314 KMP_DEBUG_ASSERT(stride >= 0);
3315 scan = next;
3316 stride *= sign;
3317 }
3318
3319 // Add places determined by initial_place : count : stride
3320 for (i = 0; i < count; i++) {
3321 if (setSize == 0) {
3322 break;
3323 }
3324 // Add the current place, then build the next place (tempMask) from that
3325 KMP_CPU_COPY(previousMask, tempMask);
3326 ADD_MASK(previousMask);
3327 KMP_CPU_ZERO(tempMask);
3328 setSize = 0;
3329 KMP_CPU_SET_ITERATE(j, previousMask) {
3330 if (!KMP_CPU_ISSET(j, previousMask)) {
3331 continue;
3332 }
3333 if ((j + stride > maxOsId) || (j + stride < 0) ||
3334 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3335 (!KMP_CPU_ISSET(j + stride,
3336 KMP_CPU_INDEX(osId2Mask, j + stride)))) {
3337 if ((__kmp_affinity_verbose ||
3338 (__kmp_affinity_warnings &&
3339 (__kmp_affinity_type != affinity_none))) &&
3340 i < count - 1) {
3341 KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
3342 }
3343 continue;
3344 }
3345 KMP_CPU_SET(j + stride, tempMask);
3346 setSize++;
3347 }
3348 }
3349 KMP_CPU_ZERO(tempMask);
3350 setSize = 0;
3351
3352 // valid follow sets are ',' and EOL
3353 SKIP_WS(scan);
3354 if (*scan == '\0') {
3355 break;
3356 }
3357 if (*scan == ',') {
3358 scan++; // skip ','
3359 continue;
3360 }
3361
3362 KMP_ASSERT2(0, "bad explicit places list");
3363 }
3364
3365 *out_numMasks = nextNewMask;
3366 if (nextNewMask == 0) {
3367 *out_masks = NULL;
3368 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3369 return;
3370 }
3371 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3372 KMP_CPU_FREE(tempMask);
3373 KMP_CPU_FREE(previousMask);
3374 for (i = 0; i < nextNewMask; i++) {
3375 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3376 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3377 KMP_CPU_COPY(dest, src);
3378 }
3379 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3380}
3381
3382#undef ADD_MASK
3383#undef ADD_MASK_OSID
3384
3385// This function figures out the deepest level at which there is at least one
3386// cluster/core with more than one processing unit bound to it.
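// For example, with topology levels {socket, core, thread} (bottom_level = 2),
// a hw thread whose ids are {0, 3, 1} has ids[2] == 1 > 0, so core_level is
// raised to 1 (the core level); if every hw thread had ids[2] == 0 (one PU per
// core), core_level would stay at 0 (the socket level).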
3387static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
3388 int core_level = 0;
3389
3390 for (int i = 0; i < nprocs; i++) {
3391 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3392 for (int j = bottom_level; j > 0; j--) {
3393 if (hw_thread.ids[j] > 0) {
3394 if (core_level < (j - 1)) {
3395 core_level = j - 1;
3396 }
3397 }
3398 }
3399 }
3400 return core_level;
3401}
3402
3403// This function counts the number of clusters/cores at the given level.
3404static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
3405 int core_level) {
3406 return __kmp_topology->get_count(core_level);
3407}
3408// This function finds to which cluster/core a given processing unit is bound.
3409static int __kmp_affinity_find_core(int proc, int bottom_level,
3410 int core_level) {
3411 int core = 0;
3412 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
3413 for (int i = 0; i <= proc; ++i) {
3414 if (i + 1 <= proc) {
3415 for (int j = 0; j <= core_level; ++j) {
3416 if (__kmp_topology->at(i + 1).sub_ids[j] !=
3417 __kmp_topology->at(i).sub_ids[j]) {
3418 core++;
3419 break;
3420 }
3421 }
3422 }
3423 }
3424 return core;
3425}
3426
3427// This function finds the maximal number of processing units bound to a
3428// cluster/core at the given level.
3429static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
3430 int core_level) {
3431 if (core_level >= bottom_level)
3432 return 1;
3433 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
3434 return __kmp_topology->calculate_ratio(thread_level, core_level);
3435}
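// Taken together, on a uniform 2-socket x 4-core x 2-thread machine (depth 3,
// bottom_level = 2, core_level = 1): __kmp_affinity_compute_ncores() returns
// 8, __kmp_affinity_max_proc_per_core() returns 2, and
// __kmp_affinity_find_core() maps the sorted hw threads 0..15 onto cores 0..7,
// two consecutive hw threads per core.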
3436
3437static int *procarr = NULL;
3438static int __kmp_aff_depth = 0;
3439
3440// Create a one element mask array (set of places) which only contains the
3441// initial process's affinity mask
3442static void __kmp_create_affinity_none_places() {
3443 KMP_ASSERT(__kmp_affin_fullMask != NULL);
3444 KMP_ASSERT(__kmp_affinity_type == affinity_none);
3445 __kmp_affinity_num_masks = 1;
3446 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3447 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
3448 KMP_CPU_COPY(dest, __kmp_affin_fullMask);
3449}
3450
3451static void __kmp_aux_affinity_initialize(void) {
3452 if (__kmp_affinity_masks != NULL) {
3453 KMP_ASSERT(__kmp_affin_fullMask != NULL);
3454 return;
3455 }
3456
3457 // Create the "full" mask - this defines all of the processors that we
3458 // consider to be in the machine model. If respect is set, then it is the
3459 // initialization thread's affinity mask. Otherwise, it is all processors that
3460 // we know about on the machine.
3461 if (__kmp_affin_fullMask == NULL) {
3462 KMP_CPU_ALLOC(__kmp_affin_fullMask);
3463 }
3464 if (KMP_AFFINITY_CAPABLE()) {
3465 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
3466 if (__kmp_affinity_respect_mask) {
3467 // Count the number of available processors.
3468 unsigned i;
3469 __kmp_avail_proc = 0;
3470 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
3471 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
3472 continue;
3473 }
3474 __kmp_avail_proc++;
3475 }
3476 if (__kmp_avail_proc > __kmp_xproc) {
3477 if (__kmp_affinity_verbose ||
3478 (__kmp_affinity_warnings &&
3479 (__kmp_affinity_type != affinity_none))) {
3480 KMP_WARNING(ErrorInitializeAffinity);
3481 }
3482 __kmp_affinity_type = affinity_none;
3483 KMP_AFFINITY_DISABLE();
3484 return;
3485 }
3486
3487 if (__kmp_affinity_verbose) {
3488 char buf[KMP_AFFIN_MASK_PRINT_LEN];
3489 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3490 __kmp_affin_fullMask);
3491 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
3492 }
3493 } else {
3494 if (__kmp_affinity_verbose) {
3495 char buf[KMP_AFFIN_MASK_PRINT_LEN];
3496 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3497 __kmp_affin_fullMask);
3498 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
3499 }
3500 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
3501 __kmp_avail_proc = __kmp_xproc;
3502#if KMP_OS_WINDOWS
3503 // Set the process affinity mask since threads' affinity
3504 // masks must be a subset of the process mask on Windows* OS
3505 __kmp_affin_fullMask->set_process_affinity(true);
3506#endif
3507 }
3508 }
3509
3510 kmp_i18n_id_t msg_id = kmp_i18n_null;
3511
3512 // For backward compatibility, setting KMP_CPUINFO_FILE =>
3513 // KMP_TOPOLOGY_METHOD=cpuinfo
3514 if ((__kmp_cpuinfo_file != NULL) &&
3515 (__kmp_affinity_top_method == affinity_top_method_all)) {
3516 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3517 }
3518
3519 bool success = false;
3520 if (__kmp_affinity_top_method == affinity_top_method_all) {
3521// In the default code path, errors are not fatal - we just try using
3522// another method. We only emit a warning message if affinity is on, or the
3523// verbose flag is set, and the nowarnings flag was not set.
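// The fallback order in this default path is: hwloc (when the affinity
// dispatcher is hwloc), the x2APIC-id and then APIC-id CPUID methods on x86,
// /proc/cpuinfo parsing on Linux, Windows processor groups (when more than one
// group is present), and finally the flat map, which must succeed.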
3524#if KMP_USE_HWLOC
3525 if (!success &&
3526 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3527 if (!__kmp_hwloc_error) {
3528 success = __kmp_affinity_create_hwloc_map(&msg_id);
3529 if (!success && __kmp_affinity_verbose) {
3530 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3531 }
3532 } else if (__kmp_affinity_verbose) {
3533 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
3534 }
3535 }
3536#endif
3537
3538#if KMP_ARCH_X86 || KMP_ARCH_X86_64
3539 if (!success) {
3540 success = __kmp_affinity_create_x2apicid_map(&msg_id);
3541 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3542 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3543 }
3544 }
3545 if (!success) {
3546 success = __kmp_affinity_create_apicid_map(&msg_id);
3547 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3548 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3549 }
3550 }
3551#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3552
3553#if KMP_OS_LINUX
3554 if (!success) {
3555 int line = 0;
3556 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
3557 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3558 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3559 }
3560 }
3561#endif /* KMP_OS_LINUX */
3562
3563#if KMP_GROUP_AFFINITY
3564 if (!success && (__kmp_num_proc_groups > 1)) {
3565 success = __kmp_affinity_create_proc_group_map(&msg_id);
3566 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3567 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3568 }
3569 }
3570#endif /* KMP_GROUP_AFFINITY */
3571
3572 if (!success) {
3573 success = __kmp_affinity_create_flat_map(&msg_id);
3574 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
3575 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
3576 }
3577 KMP_ASSERT(success);
3578 }
3579 }
3580
3581// If the user has specified that a particular topology discovery method is to be
3582// used, then we abort if that method fails. The exception is group affinity,
3583// which might have been implicitly set.
3584#if KMP_USE_HWLOC
3585 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
3586 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
3587 success = __kmp_affinity_create_hwloc_map(&msg_id);
3588 if (!success) {
3589 KMP_ASSERT(msg_id != kmp_i18n_null);
3590 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3591 }
3592 }
3593#endif // KMP_USE_HWLOC
3594
3595#if KMP_ARCH_X86 || KMP_ARCH_X86_64
3596 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
3597 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
3598 success = __kmp_affinity_create_x2apicid_map(&msg_id);
3599 if (!success) {
3600 KMP_ASSERT(msg_id != kmp_i18n_null);
3601 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3602 }
3603 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3604 success = __kmp_affinity_create_apicid_map(&msg_id);
3605 if (!success) {
3606 KMP_ASSERT(msg_id != kmp_i18n_null);
3607 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3608 }
3609 }
3610#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3611
3612 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3613 int line = 0;
3614 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
3615 if (!success) {
3616 KMP_ASSERT(msg_id != kmp_i18n_null);
3617 const char *filename = __kmp_cpuinfo_get_filename();
3618 if (line > 0) {
3619 KMP_FATAL(FileLineMsgExiting, filename, line,
3620 __kmp_i18n_catgets(msg_id));
3621 } else {
3622 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3623 }
3624 }
3625 }
3626
3627#if KMP_GROUP_AFFINITY
3628 else if (__kmp_affinity_top_method == affinity_top_method_group) {
3629 success = __kmp_affinity_create_proc_group_map(&msg_id);
3630 KMP_ASSERT(success);
3631 if (!success) {
3632 KMP_ASSERT(msg_id != kmp_i18n_null);
3633 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3634 }
3635 }
3636#endif /* KMP_GROUP_AFFINITY */
3637
3638 else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3639 success = __kmp_affinity_create_flat_map(&msg_id);
3640 // should not fail
3641 KMP_ASSERT(success);
3642 }
3643
3644 // Early exit if topology could not be created
3645 if (!__kmp_topology) {
3646 if (KMP_AFFINITY_CAPABLE() &&
3647 (__kmp_affinity_verbose ||
3648 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
3649 KMP_WARNING(ErrorInitializeAffinity);
3650 }
3651 if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
3652 __kmp_ncores > 0) {
3653 __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
3654 __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
3655 __kmp_nThreadsPerCore, __kmp_ncores);
3656 if (__kmp_affinity_verbose) {
3657 __kmp_topology->print("KMP_AFFINITY");
3658 }
3659 }
3660 __kmp_affinity_type = affinity_none;
3661 __kmp_create_affinity_none_places();
3662#if KMP_USE_HIER_SCHED
3663 __kmp_dispatch_set_hierarchy_values();
3664#endif
3665 KMP_AFFINITY_DISABLE();
3666 return;
3667 }
3668
3669 // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
3670 // initialize other data structures which depend on the topology
3671 __kmp_topology->canonicalize();
3672 if (__kmp_affinity_verbose)
3673 __kmp_topology->print("KMP_AFFINITY");
3674 bool filtered = __kmp_topology->filter_hw_subset();
3675 if (filtered && __kmp_affinity_verbose)
3676 __kmp_topology->print("KMP_HW_SUBSET");
3677 machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
3678 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
3679 // If KMP_AFFINITY=none, then only create the single "none" place,
3680 // which is either the process's initial affinity mask or the full set of
3681 // hardware threads, depending on respect,norespect
3682 if (__kmp_affinity_type == affinity_none) {
3683 __kmp_create_affinity_none_places();
3684#if KMP_USE_HIER_SCHED
3685 __kmp_dispatch_set_hierarchy_values();
3686#endif
3687 return;
3688 }
3689 int depth = __kmp_topology->get_depth();
3690
3691 // Create the table of masks, indexed by OS proc id.
3692 unsigned maxIndex;
3693 unsigned numUnique;
3694 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
3695 if (__kmp_affinity_gran_levels == 0) {
3696 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3697 }
3698
3699 switch (__kmp_affinity_type) {
3700
3701 case affinity_explicit:
3702 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3703 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
3704 __kmp_affinity_process_proclist(
3705 &__kmp_affinity_masks, &__kmp_affinity_num_masks,
3706 __kmp_affinity_proclist, osId2Mask, maxIndex);
3707 } else {
3708 __kmp_affinity_process_placelist(
3709 &__kmp_affinity_masks, &__kmp_affinity_num_masks,
3710 __kmp_affinity_proclist, osId2Mask, maxIndex);
3711 }
3712 if (__kmp_affinity_num_masks == 0) {
3713 if (__kmp_affinity_verbose ||
3714 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3715 KMP_WARNING(AffNoValidProcID);
3716 }
3717 __kmp_affinity_type = affinity_none;
3718 __kmp_create_affinity_none_places();
3719 return;
3720 }
3721 break;
3722
3723 // The other affinity types rely on sorting the hardware threads according to
3724 // some permutation of the machine topology tree. Set __kmp_affinity_compact
3725 // and __kmp_affinity_offset appropriately, then jump to a common code
3726 // fragment to do the sort and create the array of affinity masks.
3727 case affinity_logical:
3728 __kmp_affinity_compact = 0;
3729 if (__kmp_affinity_offset) {
3730 __kmp_affinity_offset =
3731 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
3732 }
3733 goto sortTopology;
3734
3735 case affinity_physical:
3736 if (__kmp_nThreadsPerCore > 1) {
3737 __kmp_affinity_compact = 1;
3738 if (__kmp_affinity_compact >= depth) {
3739 __kmp_affinity_compact = 0;
3740 }
3741 } else {
3742 __kmp_affinity_compact = 0;
3743 }
3744 if (__kmp_affinity_offset) {
3745 __kmp_affinity_offset =
3746 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
3747 }
3748 goto sortTopology;
3749
3750 case affinity_scatter:
3751 if (__kmp_affinity_compact >= depth) {
3752 __kmp_affinity_compact = 0;
3753 } else {
3754 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3755 }
3756 goto sortTopology;
3757
3758 case affinity_compact:
3759 if (__kmp_affinity_compact >= depth) {
3760 __kmp_affinity_compact = depth - 1;
3761 }
3762 goto sortTopology;
3763
3764 case affinity_balanced:
3765 if (depth <= 1) {
3766 if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
3767 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
3768 }
3769 __kmp_affinity_type = affinity_none;
3770 __kmp_create_affinity_none_places();
3771 return;
3772 } else if (!__kmp_topology->is_uniform()) {
3773 // Save the depth for further usage
3774 __kmp_aff_depth = depth;
3775
3776 int core_level =
3777 __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
3778 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
3779 core_level);
3780 int maxprocpercore = __kmp_affinity_max_proc_per_core(
3781 __kmp_avail_proc, depth - 1, core_level);
3782
3783 int nproc = ncores * maxprocpercore;
3784 if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
3785 if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
3786 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
3787 }
3788 __kmp_affinity_type = affinity_none;
3789 return;
3790 }
3791
3792 procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
3793 for (int i = 0; i < nproc; i++) {
3794 procarr[i] = -1;
3795 }
3796
3797 int lastcore = -1;
3798 int inlastcore = 0;
3799 for (int i = 0; i < __kmp_avail_proc; i++) {
3800 int proc = __kmp_topology->at(i).os_id;
3801 int core = __kmp_affinity_find_core(i, depth - 1, core_level);
3802
3803 if (core == lastcore) {
3804 inlastcore++;
3805 } else {
3806 inlastcore = 0;
3807 }
3808 lastcore = core;
3809
3810 procarr[core * maxprocpercore + inlastcore] = proc;
3811 }
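      // procarr is now an ncores x maxprocpercore table: row c holds the OS
      // ids of the processing units on core c in topology order, padded with
      // -1 entries for cores that have fewer than maxprocpercore units.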
3812 }
3813 if (__kmp_affinity_compact >= depth) {
3814 __kmp_affinity_compact = depth - 1;
3815 }
3816
3817 sortTopology:
3818 // Allocate the gtid->affinity mask table.
3819 if (__kmp_affinity_dups) {
3820 __kmp_affinity_num_masks = __kmp_avail_proc;
3821 } else {
3822 __kmp_affinity_num_masks = numUnique;
3823 }
3824
3825 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
3826 (__kmp_affinity_num_places > 0) &&
3827 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
3828 __kmp_affinity_num_masks = __kmp_affinity_num_places;
3829 }
3830
3831 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3832
3833 // Sort the topology table according to the current setting of
3834 // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3835 __kmp_topology->sort_compact();
3836 {
3837 int i;
3838 unsigned j;
3839 int num_hw_threads = __kmp_topology->get_num_hw_threads();
3840 for (i = 0, j = 0; i < num_hw_threads; i++) {
3841 if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
3842 continue;
3843 }
3844 int osId = __kmp_topology->at(i).os_id;
3845
3846 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3847 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3848 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3849 KMP_CPU_COPY(dest, src);
3850 if (++j >= __kmp_affinity_num_masks) {
3851 break;
3852 }
3853 }
3854 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3855 }
3856 // Sort the topology back using ids
3857 __kmp_topology->sort_ids();
3858 break;
3859
3860 default:
3861 KMP_ASSERT2(0, "Unexpected affinity setting");
3862 }
3863
3864 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
3865}
3866
3867void __kmp_affinity_initialize(void) {
3868 // Much of the code above was written assuming that if a machine was not
3869 // affinity capable, then __kmp_affinity_type == affinity_none. We now
3870 // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3871 // There are too many checks for __kmp_affinity_type == affinity_none
3872 // in this code. Instead of trying to change them all, check if
3873 // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3874 // affinity_none, call the real initialization routine, then restore
3875 // __kmp_affinity_type to affinity_disabled.
3876 int disabled = (__kmp_affinity_type == affinity_disabled);
3877 if (!KMP_AFFINITY_CAPABLE()) {
3878 KMP_ASSERT(disabled);
3879 }
3880 if (disabled) {
3881 __kmp_affinity_type = affinity_none;
3882 }
3883 __kmp_aux_affinity_initialize();
3884 if (disabled) {
3885 __kmp_affinity_type = affinity_disabled;
3886 }
3887}
3888
3889void __kmp_affinity_uninitialize(void) {
3890 if (__kmp_affinity_masks != NULL) {
3891 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
3892 __kmp_affinity_masks = NULL;
3893 }
3894 if (__kmp_affin_fullMask != NULL) {
3895 KMP_CPU_FREE(__kmp_affin_fullMask);
3896 __kmp_affin_fullMask = NULL;
3897 }
3898 __kmp_affinity_num_masks = 0;
3899 __kmp_affinity_type = affinity_default;
3900 __kmp_affinity_num_places = 0;
3901 if (__kmp_affinity_proclist != NULL) {
3902 __kmp_free(__kmp_affinity_proclist);
3903 __kmp_affinity_proclist = NULL;
3904 }
3905 if (procarr != NULL) {
3906 __kmp_free(procarr);
3907 procarr = NULL;
3908 }
3909#if KMP_USE_HWLOC
3910 if (__kmp_hwloc_topology != NULL) {
3911 hwloc_topology_destroy(__kmp_hwloc_topology);
3912 __kmp_hwloc_topology = NULL;
3913 }
3914#endif
3915 if (__kmp_hw_subset) {
3916 kmp_hw_subset_t::deallocate(__kmp_hw_subset);
3917 __kmp_hw_subset = nullptr;
3918 }
3919 if (__kmp_topology) {
3920 kmp_topology_t::deallocate(__kmp_topology);
3921 __kmp_topology = nullptr;
3922 }
3923 KMPAffinity::destroy_api();
3924}
3925
3926void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
3927 if (!KMP_AFFINITY_CAPABLE()) {
3928 return;
3929 }
3930
3931 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3932 if (th->th.th_affin_mask == NULL) {
3933 KMP_CPU_ALLOC(th->th.th_affin_mask);
3934 } else {
3935 KMP_CPU_ZERO(th->th.th_affin_mask);
3936 }
3937
3938 // Copy the thread mask to the kmp_info_t structure. If
3939 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e., one that
3940 // has all of the OS proc ids set (or, if __kmp_affinity_respect_mask is set,
3941 // the initialization thread's mask, which the full mask then equals).
3942 kmp_affin_mask_t *mask;
3943 int i;
3944
3945 if (KMP_AFFINITY_NON_PROC_BIND) {
3946 if ((__kmp_affinity_type == affinity_none) ||
3947 (__kmp_affinity_type == affinity_balanced) ||
3948 KMP_HIDDEN_HELPER_THREAD(gtid)) {
3949#if KMP_GROUP_AFFINITY
3950 if (__kmp_num_proc_groups > 1) {
3951 return;
3952 }
3953#endif
3954 KMP_ASSERT(__kmp_affin_fullMask != NULL);
3955 i = 0;
3956 mask = __kmp_affin_fullMask;
3957 } else {
3958 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
3959 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
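      // Places are handed out round-robin: e.g., with 8 places and
      // __kmp_affinity_offset == 2, the thread with adjusted gtid 3 is
      // assigned place (3 + 2) % 8 == 5.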
3960 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3961 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3962 }
3963 } else {
3964 if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
3965 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3966#if KMP_GROUP_AFFINITY
3967 if (__kmp_num_proc_groups > 1) {
3968 return;
3969 }
3970#endif
3971 KMP_ASSERT(__kmp_affin_fullMask != NULL);
3972 i = KMP_PLACE_ALL;
3973 mask = __kmp_affin_fullMask;
3974 } else {
3975 // int i = some hash function or just a counter that doesn't
3976 // always start at 0. Use adjusted gtid for now.
3977 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
3978 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
3979 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3980 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3981 }
3982 }
3983
3984 th->th.th_current_place = i;
3985 if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
3986 th->th.th_new_place = i;
3987 th->th.th_first_place = 0;
3988 th->th.th_last_place = __kmp_affinity_num_masks - 1;
3989 } else if (KMP_AFFINITY_NON_PROC_BIND) {
3990 // When using a Non-OMP_PROC_BIND affinity method,
3991 // set all threads' place-partition-var to the entire place list
3992 th->th.th_first_place = 0;
3993 th->th.th_last_place = __kmp_affinity_num_masks - 1;
3994 }
3995
3996 if (i == KMP_PLACE_ALL) {
3997 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
3998 gtid));
3999 } else {
4000 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4001 gtid, i));
4002 }
4003
4004 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4005
4006 if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
4007 /* to avoid duplicate printing (will be correctly printed on barrier) */
4008 && (__kmp_affinity_type == affinity_none ||
4009 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
4010 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4011 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4012 th->th.th_affin_mask);
4013 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4014 __kmp_gettid(), gtid, buf);
4015 }
4016
4017#if KMP_DEBUG
4018 // Hidden helper thread affinity only printed for debug builds
4019 if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
4020 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4021 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4022 th->th.th_affin_mask);
4023 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
4024 (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
4025 }
4026#endif
4027
4028#if KMP_OS_WINDOWS
4029 // On Windows* OS, the process affinity mask might have changed. If the user
4030 // didn't request affinity and this call fails, just continue silently.
4031 // See CQ171393.
4032 if (__kmp_affinity_type == affinity_none) {
4033 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4034 } else
4035#endif
4036 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4037}
4038
4039void __kmp_affinity_set_place(int gtid) {
4040 if (!KMP_AFFINITY_CAPABLE()) {
4041 return;
4042 }
4043
4044 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4045
4046 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
4047 "place = %d)\n",
4048 gtid, th->th.th_new_place, th->th.th_current_place));
4049
4050 // Check that the new place is within this thread's partition.
4051 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4052 KMP_ASSERT(th->th.th_new_place >= 0);
4053 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4054 if (th->th.th_first_place <= th->th.th_last_place) {
4055 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4056 (th->th.th_new_place <= th->th.th_last_place));
4057 } else {
4058 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4059 (th->th.th_new_place >= th->th.th_last_place));
4060 }
4061
4062 // Copy the thread mask to the kmp_info_t structure,
4063 // and set this thread's affinity.
4064 kmp_affin_mask_t *mask =
4065 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4066 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4067 th->th.th_current_place = th->th.th_new_place;
4068
4069 if (__kmp_affinity_verbose) {
4070 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4071 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4072 th->th.th_affin_mask);
4073 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4074 __kmp_gettid(), gtid, buf);
4075 }
4076 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4077}
4078
4079int __kmp_aux_set_affinity(void **mask) {
4080 int gtid;
4081 kmp_info_t *th;
4082 int retval;
4083
4084 if (!KMP_AFFINITY_CAPABLE()) {
4085 return -1;
4086 }
4087
4088 gtid = __kmp_entry_gtid();
4089 KA_TRACE(
4090 1000, (""); {
4091 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4092 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4093 (kmp_affin_mask_t *)(*mask));
4094 __kmp_debug_printf(
4095 "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4096 gtid, buf);
4097 });
4098
4099 if (__kmp_env_consistency_check) {
4100 if ((mask == NULL) || (*mask == NULL)) {
4101 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4102 } else {
4103 unsigned proc;
4104 int num_procs = 0;
4105
4106 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4107 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4108 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4109 }
4110 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4111 continue;
4112 }
4113 num_procs++;
4114 }
4115 if (num_procs == 0) {
4116 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4117 }
4118
4119#if KMP_GROUP_AFFINITY
4120 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4121 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4122 }
4123#endif /* KMP_GROUP_AFFINITY */
4124 }
4125 }
4126
4127 th = __kmp_threads[gtid];
4128 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4129 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4130 if (retval == 0) {
4131 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4132 }
4133
4134 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4135 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4136 th->th.th_first_place = 0;
4137 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4138
4139 // Turn off 4.0 affinity for the current thread at this parallel level.
4140 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4141
4142 return retval;
4143}
4144
4145int __kmp_aux_get_affinity(void **mask) {
4146 int gtid;
4147 int retval;
4148 kmp_info_t *th;
4149
4150 if (!KMP_AFFINITY_CAPABLE()) {
4151 return -1;
4152 }
4153
4154 gtid = __kmp_entry_gtid();
4155 th = __kmp_threads[gtid];
4156 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4157
4158 KA_TRACE(
4159 1000, (""); {
4160 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4161 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4162 th->th.th_affin_mask);
4163 __kmp_printf(
4164 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
4165 buf);
4166 });
4167
4168 if (__kmp_env_consistency_check) {
4169 if ((mask == NULL) || (*mask == NULL)) {
4170 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4171 }
4172 }
4173
4174#if !KMP_OS_WINDOWS
4175
4176 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4177 KA_TRACE(
4178 1000, (""); {
4179 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4180 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4181 (kmp_affin_mask_t *)(*mask));
4182 __kmp_printf(
4183 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
4184 buf);
4185 });
4186 return retval;
4187
4188#else
4189 (void)retval;
4190
4191 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4192 return 0;
4193
4194#endif /* KMP_OS_WINDOWS */
4195}
4196
4197int __kmp_aux_get_affinity_max_proc() {
4198 if (!KMP_AFFINITY_CAPABLE()) {
4199 return 0;
4200 }
4201#if KMP_GROUP_AFFINITY
4202 if (__kmp_num_proc_groups > 1) {
4203 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
4204 }
4205#endif
4206 return __kmp_xproc;
4207}
4208
4209int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
4210 if (!KMP_AFFINITY_CAPABLE()) {
4211 return -1;
4212 }
4213
4214 KA_TRACE(
4215 1000, (""); {
4216 int gtid = __kmp_entry_gtid();
4217 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4218 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4219 (kmp_affin_mask_t *)(*mask));
4220 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
4221 "affinity mask for thread %d = %s\n",
4222 proc, gtid, buf);
4223 });
4224
4225 if (__kmp_env_consistency_check) {
4226 if ((mask == NULL) || (*mask == NULL)) {
4227 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4228 }
4229 }
4230
4231 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4232 return -1;
4233 }
4234 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4235 return -2;
4236 }
4237
4238 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4239 return 0;
4240}
4241
4242int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
4243 if (!KMP_AFFINITY_CAPABLE()) {
4244 return -1;
4245 }
4246
4247 KA_TRACE(
4248 1000, (""); {
4249 int gtid = __kmp_entry_gtid();
4250 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4251 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4252 (kmp_affin_mask_t *)(*mask));
4253 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
4254 "affinity mask for thread %d = %s\n",
4255 proc, gtid, buf);
4256 });
4257
4258 if (__kmp_env_consistency_check) {
4259 if ((mask == NULL) || (*mask == NULL)) {
4260 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4261 }
4262 }
4263
4264 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4265 return -1;
4266 }
4267 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4268 return -2;
4269 }
4270
4271 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4272 return 0;
4273}
4274
4275int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
4276 if (!KMP_AFFINITY_CAPABLE()) {
4277 return -1;
4278 }
4279
4280 KA_TRACE(
4281 1000, (""); {
4282 int gtid = __kmp_entry_gtid();
4283 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4284 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4285 (kmp_affin_mask_t *)(*mask));
4286 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
4287 "affinity mask for thread %d = %s\n",
4288 proc, gtid, buf);
4289 });
4290
4291 if (__kmp_env_consistency_check) {
4292 if ((mask == NULL) || (*mask == NULL)) {
4293 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4294 }
4295 }
4296
4297 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4298 return -1;
4299 }
4300 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4301 return 0;
4302 }
4303
4304 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4305}
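// Editorial sketch (not part of the runtime, wrapped in #if 0 so it is never
// compiled): a small user-level program exercising the kmp_* affinity API that
// the __kmp_aux_* routines above back. It assumes the kmp_affinity_mask_t type
// and kmp_* prototypes that this runtime's omp.h provides as extensions.
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  kmp_affinity_mask_t mask;
  kmp_create_affinity_mask(&mask);
  // Try to restrict the calling thread to OS proc 0.
  if (kmp_set_affinity_mask_proc(0, &mask) == 0 &&
      kmp_set_affinity(&mask) == 0) {
    std::printf("bound to OS proc 0 (affinity max proc = %d)\n",
                kmp_get_affinity_max_proc());
  } else {
    std::printf("affinity not capable, or proc 0 is not in the full mask\n");
  }
  kmp_destroy_affinity_mask(&mask);
  return 0;
}
#endif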
4306
4307// Dynamic affinity settings - Affinity balanced
4308void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
4309 KMP_DEBUG_ASSERT(th);
4310 bool fine_gran = true;
4311 int tid = th->th.th_info.ds.ds_tid;
4312
4313 // Do not perform balanced affinity for the hidden helper threads
4314 if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
4315 return;
4316
4317 switch (__kmp_affinity_gran) {
4318 case KMP_HW_THREAD:
4319 break;
4320 case KMP_HW_CORE:
4321 if (__kmp_nThreadsPerCore > 1) {
4322 fine_gran = false;
4323 }
4324 break;
4325 case KMP_HW_SOCKET:
4326 if (nCoresPerPkg > 1) {
4327 fine_gran = false;
4328 }
4329 break;
4330 default:
4331 fine_gran = false;
4332 }
4333
4334 if (__kmp_topology->is_uniform()) {
4335 int coreID;
4336 int threadID;
4337 // Number of hyper-threads per core on an HT machine
4338 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4339 // Number of cores
4340 int ncores = __kmp_ncores;
4341 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
4342 __kmp_nth_per_core = __kmp_avail_proc / nPackages;
4343 ncores = nPackages;
4344 }
4345 // How many threads will be bound to each core
4346 int chunk = nthreads / ncores;
4347 // How many cores will have an additional thread bound to them - the "big cores"
4348 int big_cores = nthreads % ncores;
4349 // Number of threads on the big cores
4350 int big_nth = (chunk + 1) * big_cores;
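    // Example: nthreads == 10 on 4 cores gives chunk == 2, big_cores == 2 and
    // big_nth == 6, so tids 0-5 land on the two "big" cores (3 threads each)
    // and tids 6-9 on the remaining two cores (2 threads each).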
4351 if (tid < big_nth) {
4352 coreID = tid / (chunk + 1);
4353 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
4354 } else { // tid >= big_nth
4355 coreID = (tid - big_cores) / chunk;
4356 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
4357 }
4358 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4359 "Illegal set affinity operation when not capable");
4360
4361 kmp_affin_mask_t *mask = th->th.th_affin_mask;
4362 KMP_CPU_ZERO(mask);
4363
4364 if (fine_gran) {
4365 int osID =
4366 __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
4367 KMP_CPU_SET(osID, mask);
4368 } else {
4369 for (int i = 0; i < __kmp_nth_per_core; i++) {
4370 int osID;
4371 osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
4372 KMP_CPU_SET(osID, mask);
4373 }
4374 }
4375 if (__kmp_affinity_verbose) {
4376 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4377 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4378 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4379 __kmp_gettid(), tid, buf);
4380 }
4381 __kmp_set_system_affinity(mask, TRUE);
4382 } else { // Non-uniform topology
4383
4384 kmp_affin_mask_t *mask = th->th.th_affin_mask;
4385 KMP_CPU_ZERO(mask);
4386
4387 int core_level =
4388 __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
4389 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
4390 __kmp_aff_depth - 1, core_level);
4391 int nth_per_core = __kmp_affinity_max_proc_per_core(
4392 __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
4393
4394 // For a performance gain, consider the special case nthreads ==
4395 // __kmp_avail_proc
4396 if (nthreads == __kmp_avail_proc) {
4397 if (fine_gran) {
4398 int osID = __kmp_topology->at(tid).os_id;
4399 KMP_CPU_SET(osID, mask);
4400 } else {
4401 int core =
4402 __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
4403 for (int i = 0; i < __kmp_avail_proc; i++) {
4404 int osID = __kmp_topology->at(i).os_id;
4405 if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
4406 core) {
4407 KMP_CPU_SET(osID, mask);
4408 }
4409 }
4410 }
4411 } else if (nthreads <= ncores) {
4412
4413 int core = 0;
4414 for (int i = 0; i < ncores; i++) {
4415 // Check if this core from procarr[] is in the mask
4416 int in_mask = 0;
4417 for (int j = 0; j < nth_per_core; j++) {
4418 if (procarr[i * nth_per_core + j] != -1) {
4419 in_mask = 1;
4420 break;
4421 }
4422 }
4423 if (in_mask) {
4424 if (tid == core) {
4425 for (int j = 0; j < nth_per_core; j++) {
4426 int osID = procarr[i * nth_per_core + j];
4427 if (osID != -1) {
4428 KMP_CPU_SET(osID, mask);
4429 // For fine granularity it is enough to set the first available
4430 // osID for this core
4431 if (fine_gran) {
4432 break;
4433 }
4434 }
4435 }
4436 break;
4437 } else {
4438 core++;
4439 }
4440 }
4441 }
4442 } else { // nthreads > ncores
4443 // Array to save the number of processors at each core
4444 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
4445 // Array to save the number of cores with "x" available processors
4446 int *ncores_with_x_procs =
4447 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
4448 // Array to save the number of cores with # procs from x to nth_per_core
4449 int *ncores_with_x_to_max_procs =
4450 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
4451
4452 for (int i = 0; i <= nth_per_core; i++) {
4453 ncores_with_x_procs[i] = 0;
4454 ncores_with_x_to_max_procs[i] = 0;
4455 }
4456
4457 for (int i = 0; i < ncores; i++) {
4458 int cnt = 0;
4459 for (int j = 0; j < nth_per_core; j++) {
4460 if (procarr[i * nth_per_core + j] != -1) {
4461 cnt++;
4462 }
4463 }
4464 nproc_at_core[i] = cnt;
4465 ncores_with_x_procs[cnt]++;
4466 }
4467
4468 for (int i = 0; i <= nth_per_core; i++) {
4469 for (int j = i; j <= nth_per_core; j++) {
4470 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
4471 }
4472 }
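      // Example: with nth_per_core == 2 and ncores_with_x_procs == {1, 2, 5}
      // (one core with no available PUs, two with one, five with two),
      // ncores_with_x_to_max_procs becomes {8, 7, 5}.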
4473
4474 // Max number of processors
4475 int nproc = nth_per_core * ncores;
4476 // An array to keep the number of threads for each context
4477 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4478 for (int i = 0; i < nproc; i++) {
4479 newarr[i] = 0;
4480 }
4481
4482 int nth = nthreads;
4483 int flag = 0;
4484 while (nth > 0) {
4485 for (int j = 1; j <= nth_per_core; j++) {
4486 int cnt = ncores_with_x_to_max_procs[j];
4487 for (int i = 0; i < ncores; i++) {
4488 // Skip the core with 0 processors
4489 if (nproc_at_core[i] == 0) {
4490 continue;
4491 }
4492 for (int k = 0; k < nth_per_core; k++) {
4493 if (procarr[i * nth_per_core + k] != -1) {
4494 if (newarr[i * nth_per_core + k] == 0) {
4495 newarr[i * nth_per_core + k] = 1;
4496 cnt--;
4497 nth--;
4498 break;
4499 } else {
4500 if (flag != 0) {
4501 newarr[i * nth_per_core + k]++;
4502 cnt--;
4503 nth--;
4504 break;
4505 }
4506 }
4507 }
4508 }
4509 if (cnt == 0 || nth == 0) {
4510 break;
4511 }
4512 }
4513 if (nth == 0) {
4514 break;
4515 }
4516 }
4517 flag = 1;
4518 }
4519 int sum = 0;
4520 for (int i = 0; i < nproc; i++) {
4521 sum += newarr[i];
4522 if (sum > tid) {
4523 if (fine_gran) {
4524 int osID = procarr[i];
4525 KMP_CPU_SET(osID, mask);
4526 } else {
4527 int coreID = i / nth_per_core;
4528 for (int ii = 0; ii < nth_per_core; ii++) {
4529 int osID = procarr[coreID * nth_per_core + ii];
4530 if (osID != -1) {
4531 KMP_CPU_SET(osID, mask);
4532 }
4533 }
4534 }
4535 break;
4536 }
4537 }
4538 __kmp_free(newarr);
4539 }
4540
4541 if (__kmp_affinity_verbose) {
4542 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4543 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4544 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4545 __kmp_gettid(), tid, buf);
4546 }
4547 __kmp_set_system_affinity(mask, TRUE);
4548 }
4549}
4550
4551#if KMP_OS_LINUX || KMP_OS_FREEBSD
4552// We don't need this entry for Windows because
4553// the GetProcessAffinityMask() API is available there
4554//
4555// The intended usage is indicated by these steps:
4556// 1) The user gets the current affinity mask
4557// 2) Then sets the affinity by calling this function
4558// 3) Error check the return value
4559// 4) Use non-OpenMP parallelization
4560// 5) Reset the affinity to what was stored in step 1)
4561#ifdef __cplusplus
4562extern "C"
4563#endif
4564 int
4565 kmp_set_thread_affinity_mask_initial()
4566// the function returns 0 on success,
4567// -1 if we cannot bind thread
4568// >0 (errno) if an error happened during binding
4569{
4570 int gtid = __kmp_get_gtid();
4571 if (gtid < 0) {
4572 // Do not touch non-omp threads
4573 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
4574 "non-omp thread, returning\n"));
4575 return -1;
4576 }
4577 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
4578 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
4579 "affinity not initialized, returning\n"));
4580 return -1;
4581 }
4582 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
4583 "set full mask for thread %d\n",
4584 gtid));
4585 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
4586 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
4587}
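// Editorial sketch (not part of the runtime, wrapped in #if 0 so it is never
// compiled): one way to follow the five usage steps listed above on Linux with
// glibc. pthread_getaffinity_np/pthread_setaffinity_np save and restore the
// caller's mask, and run_non_openmp_parallel_work is a hypothetical placeholder
// for the user's non-OpenMP parallel region.
#if 0
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

extern "C" int kmp_set_thread_affinity_mask_initial();

static void run_non_openmp_parallel_work() { /* e.g., hand-rolled pthreads */ }

int main() {
  cpu_set_t saved;
  CPU_ZERO(&saved);
  // 1) Get (and save) the current affinity mask.
  pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);
  // 2) Set the affinity to the initial full mask, and 3) check the result.
  if (kmp_set_thread_affinity_mask_initial() != 0) {
    fprintf(stderr, "could not reset the thread to the initial full mask\n");
    return 1;
  }
  // 4) Use non-OpenMP parallelization.
  run_non_openmp_parallel_work();
  // 5) Reset the affinity to the mask saved in step 1.
  pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);
  return 0;
}
#endif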
4588#endif
4589
4590#endif // KMP_AFFINITY_SUPPORTED