/* Test and benchmark of a couple of parallel sorting algorithms.
Copyright (C) 2008 Free Software Foundation, Inc.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
. */
#include
#include
#include
#include
#include
#include
int failures;
#define THRESHOLD 100
static void
verify (const char *name, double stime, int *array, int count)
{
int i;
double etime = omp_get_wtime ();
printf ("%s: %g\n", name, etime - stime);
for (i = 1; i < count; i++)
if (array[i] < array[i - 1])
{
printf ("%s: incorrectly sorted\n", name);
failures = 1;
}
}
static void
insertsort (int *array, int s, int e)
{
int i, j, val;
for (i = s + 1; i <= e; i++)
{
val = array[i];
j = i;
while (j-- > s && val < array[j])
array[j + 1] = array[j];
array[j + 1] = val;
}
}
struct int_pair
{
int lo;
int hi;
};
struct int_pair_stack
{
struct int_pair *top;
#define STACK_SIZE 4 * CHAR_BIT * sizeof (int)
struct int_pair arr[STACK_SIZE];
};
static inline void
init_int_pair_stack (struct int_pair_stack *stack)
{
stack->top = &stack->arr[0];
}
static inline void
push_int_pair_stack (struct int_pair_stack *stack, int lo, int hi)
{
stack->top->lo = lo;
stack->top->hi = hi;
stack->top++;
}
static inline void
pop_int_pair_stack (struct int_pair_stack *stack, int *lo, int *hi)
{
stack->top--;
*lo = stack->top->lo;
*hi = stack->top->hi;
}
static inline int
size_int_pair_stack (struct int_pair_stack *stack)
{
return stack->top - &stack->arr[0];
}
static inline void
busy_wait (void)
{
#if defined __i386__ || defined __x86_64__
__asm volatile ("rep; nop" : : : "memory");
#elif defined __ia64__
__asm volatile ("hint @pause" : : : "memory");
#elif defined __sparc__ && (defined __arch64__ || defined __sparc_v9__)
__asm volatile ("membar #LoadLoad" : : : "memory");
#else
__asm volatile ("" : : : "memory");
#endif
}
static inline void
swap (int *array, int a, int b)
{
int val = array[a];
array[a] = array[b];
array[b] = val;
}
static inline int
choose_pivot (int *array, int lo, int hi)
{
int mid = (lo + hi) / 2;
if (array[mid] < array[lo])
swap (array, lo, mid);
if (array[hi] < array[mid])
{
swap (array, mid, hi);
if (array[mid] < array[lo])
swap (array, lo, mid);
}
return array[mid];
}
static inline int
partition (int *array, int lo, int hi)
{
int pivot = choose_pivot (array, lo, hi);
int left = lo;
int right = hi;
for (;;)
{
while (array[++left] < pivot);
while (array[--right] > pivot);
if (left >= right)
break;
swap (array, left, right);
}
return left;
}
static void
sort1 (int *array, int count)
{
omp_lock_t lock;
struct int_pair_stack global_stack;
int busy = 1;
int num_threads;
omp_init_lock (&lock);
init_int_pair_stack (&global_stack);
#pragma omp parallel firstprivate (array, count)
{
int lo = 0, hi = 0, mid, next_lo, next_hi;
bool idle = true;
struct int_pair_stack local_stack;
init_int_pair_stack (&local_stack);
if (omp_get_thread_num () == 0)
{
num_threads = omp_get_num_threads ();
hi = count - 1;
idle = false;
}
for (;;)
{
if (hi - lo < THRESHOLD)
{
insertsort (array, lo, hi);
lo = hi;
}
if (lo >= hi)
{
if (size_int_pair_stack (&local_stack) == 0)
{
again:
omp_set_lock (&lock);
if (size_int_pair_stack (&global_stack) == 0)
{
if (!idle)
busy--;
if (busy == 0)
{
omp_unset_lock (&lock);
break;
}
omp_unset_lock (&lock);
idle = true;
while (size_int_pair_stack (&global_stack) == 0
&& busy)
busy_wait ();
goto again;
}
if (idle)
busy++;
pop_int_pair_stack (&global_stack, &lo, &hi);
omp_unset_lock (&lock);
idle = false;
}
else
pop_int_pair_stack (&local_stack, &lo, &hi);
}
mid = partition (array, lo, hi);
if (mid - lo < hi - mid)
{
next_lo = mid;
next_hi = hi;
hi = mid - 1;
}
else
{
next_lo = lo;
next_hi = mid - 1;
lo = mid;
}
if (next_hi - next_lo < THRESHOLD)
insertsort (array, next_lo, next_hi);
else
{
if (size_int_pair_stack (&global_stack) < num_threads - 1)
{
int size;
omp_set_lock (&lock);
size = size_int_pair_stack (&global_stack);
if (size < num_threads - 1 && size < STACK_SIZE)
push_int_pair_stack (&global_stack, next_lo, next_hi);
else
push_int_pair_stack (&local_stack, next_lo, next_hi);
omp_unset_lock (&lock);
}
else
push_int_pair_stack (&local_stack, next_lo, next_hi);
}
}
}
omp_destroy_lock (&lock);
}
static void
sort2_1 (int *array, int lo, int hi, int num_threads, int *busy)
{
int mid;
if (hi - lo < THRESHOLD)
{
insertsort (array, lo, hi);
return;
}
mid = partition (array, lo, hi);
if (*busy >= num_threads)
{
sort2_1 (array, lo, mid - 1, num_threads, busy);
sort2_1 (array, mid, hi, num_threads, busy);
return;
}
#pragma omp atomic
*busy += 1;
#pragma omp parallel num_threads (2) \
firstprivate (array, lo, hi, mid, num_threads, busy)
{
if (omp_get_thread_num () == 0)
sort2_1 (array, lo, mid - 1, num_threads, busy);
else
{
sort2_1 (array, mid, hi, num_threads, busy);
#pragma omp atomic
*busy -= 1;
}
}
}
static void
sort2 (int *array, int count)
{
int num_threads;
int busy = 1;
#pragma omp parallel
#pragma omp single nowait
num_threads = omp_get_num_threads ();
sort2_1 (array, 0, count - 1, num_threads, &busy);
}
#if _OPENMP >= 200805
static void
sort3_1 (int *array, int lo, int hi)
{
int mid;
if (hi - lo < THRESHOLD)
{
insertsort (array, lo, hi);
return;
}
mid = partition (array, lo, hi);
#pragma omp task
sort3_1 (array, lo, mid - 1);
sort3_1 (array, mid, hi);
}
static void
sort3 (int *array, int count)
{
#pragma omp parallel
#pragma omp single
sort3_1 (array, 0, count - 1);
}
#endif
int
main (int argc, char **argv)
{
int i, count = 1000000;
double stime;
int *unsorted, *sorted, num_threads;
if (argc >= 2)
count = strtoul (argv[1], NULL, 0);
unsorted = malloc (count * sizeof (int));
sorted = malloc (count * sizeof (int));
if (unsorted == NULL || sorted == NULL)
{
puts ("allocation failure");
exit (1);
}
srand (0xdeadbeef);
for (i = 0; i < count; i++)
unsorted[i] = rand ();
omp_set_nested (1);
omp_set_dynamic (0);
#pragma omp parallel
#pragma omp single nowait
num_threads = omp_get_num_threads ();
printf ("Threads: %d\n", num_threads);
memcpy (sorted, unsorted, count * sizeof (int));
stime = omp_get_wtime ();
sort1 (sorted, count);
verify ("sort1", stime, sorted, count);
memcpy (sorted, unsorted, count * sizeof (int));
stime = omp_get_wtime ();
sort2 (sorted, count);
verify ("sort2", stime, sorted, count);
#if _OPENMP >= 200805
memcpy (sorted, unsorted, count * sizeof (int));
stime = omp_get_wtime ();
sort3 (sorted, count);
verify ("sort3", stime, sorted, count);
#endif
return 0;
}