Lecture course, 2008

Parallel programming with OpenMP (lecturer: Popova)


OpenMP

· Parallelization with OpenMP is orthogonal to the functionality of the program
  - If the compiler does not recognize the OpenMP directives, the code remains functional (albeit single-threaded)

· OpenMP is supported by the major hardware and software vendors:
  - Intel, Microsoft, Sun, IBM, HP, ...


An OpenMP example program
#include <omp.h>
#include <stdio.h>

int main()
{
    int nthreads, tid;

    #pragma omp parallel private(nthreads, tid)
    {
        tid = omp_get_thread_num();
        printf("Hello World from thread = %d\n", tid);

        if (tid == 0) {
            nthreads = omp_get_num_threads();
            printf("Number of threads = %d\n", nthreads);
        }
    }
}


OpenMP threads

· The threads of a team are numbered from 0 to N-1; thread 0 is the master thread
· The number of threads can be set with:
  - the environment variable OMP_NUM_THREADS
  - the library call omp_set_num_threads()
· Inside a parallel region:
  - a thread obtains its own number with omp_get_thread_num()
  - and the size of the team with omp_get_num_threads()


The OpenMP data environment

· Variables in a parallel region are either shared by all threads of the team or private to each thread
· Incorrect sharing of data that threads modify leads to race conditions, so the shared/private attributes must be chosen carefully (shared, private clauses)
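A minimal sketch (not part of the original slides) of such a race condition: the shared counter is updated without any synchronization, so the printed result is unpredictable.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int counter = 0;                 /* shared by default */

    #pragma omp parallel
    {
        int i;                       /* private: declared inside the region */
        for (i = 0; i < 100000; i++)
            counter++;               /* unsynchronized update of shared data */
    }

    /* Usually prints less than (number of threads) * 100000. */
    printf("counter = %d\n", counter);
    return 0;
}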


OpenMP directives in C

#pragma omp directive_name [clause [clause ...]] newline

Directives are case-sensitive and follow the C conventions for compiler directives; only one directive name may appear per directive. A directive applies to at most one succeeding statement, which must be a structured block.



#pragma omp parallel
{
    id = omp_get_thread_num();       /* each thread gets its own id */
    res[id] = lots_of_work(id);      /* ...and does its own share of the work */
}
............

int lots_of_work(int tid)
{
    .........
}


The parallel directive

A parallel region is a block of code that will be executed by a team of threads.

#pragma omp parallel [clause [[,] clause] ...]

clause:
    if (scalar_expression)
    private (list)
    shared (list)
    default (shared | none)
    firstprivate (list)
    reduction (operator : list)
    copyin (list)
    num_threads (integer_expression)
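A small sketch (assumed example, not from the slides) of the if and num_threads clauses: the region runs in parallel only when the condition holds, with a team of at most 4 threads.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int n = 1000;

    /* Parallel only if n > 100; a team of at most 4 threads is requested. */
    #pragma omp parallel if (n > 100) num_threads(4)
    {
        printf("thread %d of %d\n",
               omp_get_thread_num(), omp_get_num_threads());
    }
    return 0;
}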


The private clause of parallel

private (list)

By default, variables that are visible where the parallel region starts are shared by all threads. Variables named in a private clause instead get a separate, uninitialized copy in each thread; assignments to a private copy are not visible to the other threads or through the original shared variable.
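A minimal sketch (not from the slides) of private: each thread writes to its own copy of tmp, and the outer value is not copied into the region.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int tmp = 42;                        /* the outer value is NOT copied in */

    #pragma omp parallel private(tmp)
    {
        /* each thread has its own, initially undefined, copy of tmp */
        tmp = omp_get_thread_num();
        printf("thread %d: tmp = %d\n", omp_get_thread_num(), tmp);
    }
    return 0;
}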


firstprivate

The firstprivate clause declares the listed variables private, exactly as private does, but in addition each thread's copy is initialized with the value the variable had just before the construct.
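A minimal sketch (not from the slides): every thread's copy starts from the outer value 100.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int base = 100;

    /* Unlike private, each thread's copy of base is initialized to 100. */
    #pragma omp parallel firstprivate(base)
    {
        base += omp_get_thread_num();
        printf("thread %d: base = %d\n", omp_get_thread_num(), base);
    }
    return 0;
}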

lastprivate

The lastprivate clause declares the listed variables private, as private does; in addition, when the work-sharing construct finishes, the original variable receives the value computed in the sequentially last iteration (or the last section).
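A minimal sketch (not from the slides): after the loop, last holds the value from the sequentially last iteration (i = 9).

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int i, last;

    #pragma omp parallel for lastprivate(last)
    for (i = 0; i < 10; i++)
        last = i * i;

    printf("last = %d\n", last);   /* prints 81 */
    return 0;
}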


default

The default clause sets the default data-sharing attribute for variables in the region. In C/C++ only default(shared) and default(none) are allowed (default(private) exists only in Fortran):

#pragma omp parallel default(none)

shared

The shared clause lists variables that are shared by all threads of the team:

#pragma omp parallel default(none) shared(x)
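A small sketch (assumed, not from the slides) combining default(none) with explicit shared and private lists; default(none) forces every variable's attribute to be stated.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int x = 0, tid;

    #pragma omp parallel default(none) shared(x) private(tid)
    {
        tid = omp_get_thread_num();
        #pragma omp atomic           /* protect the update of the shared x */
        x += tid;
    }
    printf("x = %d\n", x);
    return 0;
}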


reduction

The reduction clause performs a reduction ("folding") of the listed variables with the given operator: each thread accumulates into a private copy, and the copies are combined into the original variable at the end of the construct. Example:

#pragma omp for reduction(+ : x)

Allowed operators: +, *, -, &, |, ^, &&, ||
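A minimal sketch (not from the slides): summing 1..100 with a + reduction.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int i, sum = 0;

    /* Each thread adds into a private copy of sum; the copies are
       combined with + when the loop finishes. */
    #pragma omp parallel for reduction(+:sum)
    for (i = 1; i <= 100; i++)
        sum += i;

    printf("sum = %d\n", sum);   /* 5050 */
    return 0;
}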


threadprivate

#pragma omp threadprivate (list)

The threadprivate directive makes the listed global or static variables private to each thread, while their values persist from one parallel region to the next. Example:

#include <stdio.h>
#include <omp.h>

static int a;
#pragma omp threadprivate(a)        /* one copy of a per thread */

void f() { printf("%d\n", a); }

int main()
{
    #pragma omp parallel
    {
        a = omp_get_thread_num();
        f();                        /* prints this thread's own copy of a */
    }
}


copyin

The copyin clause of the parallel directive applies to threadprivate variables: on entry to the region, each thread's copy of the listed variables is initialized with the value of the master thread's copy.
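A minimal sketch (not from the slides) combining threadprivate with copyin: the master's value 42 is copied into every thread's private copy.

#include <stdio.h>
#include <omp.h>

int counter = 10;                    /* file-scope variable...        */
#pragma omp threadprivate(counter)   /* ...with one copy per thread   */

int main(void)
{
    counter = 42;                    /* set the master thread's copy */

    #pragma omp parallel copyin(counter)
    {
        counter += omp_get_thread_num();
        printf("thread %d: counter = %d\n", omp_get_thread_num(), counter);
    }
    return 0;
}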


Work-sharing constructs

A work-sharing construct divides the execution of the enclosed code among the members of the team. The work-sharing directives are: for, sections, single.

[Figure: work_share1.gif]


The for directive

#pragma omp for [clause ...]
    for-loop

clause:
    schedule (type [, chunk_size])
    ordered
    private (list)
    firstprivate (list)
    lastprivate (list)
    reduction (operator : list)
    nowait


Canonical form of the parallel loop

The loop controlled by a for directive must have the canonical form

    for (init-expr; var logical-op b; incr-expr)

where

    init-expr  ::=  var = expr
    logical-op ::=  <  |  <=  |  >  |  >=
    incr-expr  ::=  var++ | ++var | var-- | --var
                  | var += incr | var -= incr
                  | var = var + incr | var = incr + var | var = var - incr

The loop variable var must not be modified inside the loop body, and incr, lb and b must be loop-invariant.



Example: the for directive

#include <omp.h>
#include <stdio.h>
#define CHUNKSIZE 100
#define N 1000

int main()
{
    int i, chunk;
    float a[N], b[N], c[N];

    /* Some initializations */
    for (i = 0; i < N; i++)
        a[i] = b[i] = i * 1.0;
    chunk = CHUNKSIZE;

    #pragma omp parallel shared(a,b,c,chunk) private(i)
    {
        #pragma omp for
        for (i = 0; i < N; i++) {
            c[i] = a[i] + b[i];
            printf("Thread %d executes loop iteration %d\n",
                   omp_get_thread_num(), i);
        }
    } /* end of parallel section */
}



The same example with the schedule(dynamic, chunk) and nowait clauses:

#include <omp.h>
#include <stdio.h>
#define CHUNKSIZE 100
#define N 1000

int main()
{
    int i, chunk;
    float a[N], b[N], c[N];

    /* Some initializations */
    for (i = 0; i < N; i++)
        a[i] = b[i] = i * 1.0;
    chunk = CHUNKSIZE;

    #pragma omp parallel shared(a,b,c,chunk) private(i)
    {
        #pragma omp for schedule(dynamic,chunk) nowait
        for (i = 0; i < N; i++) {
            c[i] = a[i] + b[i];
            printf("Thread %d executes loop iteration %d\n",
                   omp_get_thread_num(), i);
        }
    } /* end of parallel section */
}


The schedule clause of the for directive

schedule describes how the loop iterations are divided among the threads:

    static  -- the iterations are divided into chunks that are assigned to the threads in a fixed, round-robin fashion before the loop starts;
    dynamic -- each thread grabs the next chunk of iterations as soon as it finishes its previous one;
    guided  -- like dynamic, but the chunk size starts large and shrinks as the loop proceeds, down to the value given by chunk (chunk is the minimum chunk size);
    runtime -- the schedule type (and chunk size) is taken at run time from the OMP_SCHEDULE environment variable.


The schedule clause of the for directive (continued)

· chunk may be specified for static, dynamic and guided
· if chunk is omitted, dynamic and guided use a chunk size of 1
· with schedule(runtime) the schedule is read from the OMP_SCHEDULE environment variable, e.g.

    setenv OMP_SCHEDULE "guided,4"
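A minimal sketch (not from the slides) of schedule(runtime), where the schedule is chosen via OMP_SCHEDULE before the program is started.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int i;

    /* Set, for example, OMP_SCHEDULE="guided,4" in the environment
       before running; the loop then uses that schedule. */
    #pragma omp parallel for schedule(runtime)
    for (i = 0; i < 16; i++)
        printf("thread %d got iteration %d\n", omp_get_thread_num(), i);

    return 0;
}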


Example: the for directive with schedule(dynamic)

#include <omp.h>
#include <stdio.h>
#define CHUNK 10
#define N     100

int main()
{
    int nthreads, tid, i, n, chunk;
    float a[N], b[N], c[N];

    for (i = 0; i < N; i++)
        a[i] = b[i] = i * 1.0;
    n = N;
    chunk = CHUNK;

    #pragma omp parallel shared(a,b,c,n,chunk) \
                         private(i,nthreads,tid)
    {
        tid = omp_get_thread_num();

        #pragma omp for schedule(dynamic,chunk)
        for (i = 0; i < n; i++) {
            c[i] = a[i] + b[i];
            printf("tid= %d i= %d c[i]= %f\n", tid, i, c[i]);
        }

        if (tid == 0) {
            nthreads = omp_get_num_threads();
            printf("Number of threads = %d\n", nthreads);
        }
    }
}


The sections directive

#pragma omp sections [clause ...]
{
    #pragma omp section
        structured_block
    #pragma omp section
        structured_block
}

clause:
    private (list)
    firstprivate (list)
    lastprivate (list)
    reduction (operator : list)
    nowait


Example: the sections directive

#include <omp.h>
#include <stdio.h>
#define N 50

int main()
{
    int i, n, nthreads, tid;
    float a[N], b[N], c[N];

    for (i = 0; i < N; i++)
        a[i] = b[i] = i * 1.0;
    n = N;

    #pragma omp parallel shared(a,b,c,n) private(i,tid,nthreads)
    {
        tid = omp_get_thread_num();
        printf("Thread %d starting...\n", tid);

        #pragma omp sections nowait
        {
            #pragma omp section
            for (i = 0; i < n/2; i++) {
                c[i] = a[i] + b[i];
                printf("tid= %d i= %d c[i]= %f\n", tid, i, c[i]);
            }

            #pragma omp section
            for (i = n/2; i < n; i++) {
                c[i] = a[i] + b[i];
                printf("tid= %d i= %d c[i]= %f\n", tid, i, c[i]);
            }
        }

        if (tid == 0) {
            nthreads = omp_get_num_threads();
            printf("Number of threads = %d\n", nthreads);
        }
    }
}


The single directive

#pragma omp single [clause ...]
    structured_block

clause:
    private (list)
    firstprivate (list)
    nowait

The block that follows single is executed by only one thread of the team (not necessarily the master); the other threads wait at an implicit barrier at the end of the construct unless nowait is specified.


Example: single

#pragma omp parallel shared(a,b) private(i)
{
    #pragma omp single
    {
        a = 0;
        printf("Single construct executed by thread %d\n",
               omp_get_thread_num());
    } /* A barrier is automatically inserted here */

    #pragma omp for
    for (i = 0; i < n; i++)
        b[i] = a;
}

Combined parallel work-sharing directives

#pragma omp parallel for [clause ...]
    for-loop

#pragma omp parallel sections [clause ...]
    structured_block

For example, #pragma omp parallel for is shorthand for:

#pragma omp parallel
{
    #pragma omp for [clause ...]
        for-loop
}
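A small sketch (assumed, not from the slides) of the combined parallel sections form:

#include <stdio.h>
#include <omp.h>

int main(void)
{
    /* One directive both creates the team and divides the sections among it. */
    #pragma omp parallel sections
    {
        #pragma omp section
        printf("section A done by thread %d\n", omp_get_thread_num());

        #pragma omp section
        printf("section B done by thread %d\n", omp_get_thread_num());
    }
    return 0;
}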



Synchronization directives

· master
· critical
· barrier
· atomic
· flush
· ordered


#pragma omp master
The block that follows is executed only by the master thread; the other threads skip it (there is no implied barrier).

#pragma omp critical [(name)]
The enclosed block is executed by at most one thread at a time; critical sections with the same name exclude each other.

#pragma omp barrier
Each thread waits at the barrier until all threads of the team have reached it; only then do they continue.
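A small sketch (assumed, not from the slides) that combines master, critical and barrier:

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int sum = 0;

    #pragma omp parallel shared(sum)
    {
        #pragma omp critical
        sum += omp_get_thread_num();   /* one thread at a time updates sum */

        #pragma omp barrier            /* wait until every thread has added its id */

        #pragma omp master             /* executed by thread 0 only */
        printf("sum of thread ids = %d\n", sum);
    }
    return 0;
}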


#pragma omp atomic
The statement that follows must update a scalar variable in one of the forms:

    x binop= expr
    x++      ++x
    x--      --x
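A minimal sketch (not from the slides): each thread atomically increments a shared counter, which is cheaper than a full critical section for a single scalar update.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int hits = 0;

    #pragma omp parallel
    {
        #pragma omp atomic
        hits++;                    /* one atomic increment per thread */
    }

    printf("hits = %d (one per thread)\n", hits);
    return 0;
}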


#pragma omp flush [(list)]

flush makes each thread's temporary view of the listed variables (or of all shared data, if no list is given) consistent with memory. An implicit flush is performed at: barrier; entry to and exit from critical and ordered; exit from parallel, for, sections and single.


OpenMP run-time library functions

· omp_set_dynamic(int) / omp_get_dynamic() -- enable or query dynamic adjustment of the number of threads
· omp_set_num_threads(int) / omp_get_num_threads() -- set the number of threads for subsequent parallel regions (overriding OMP_NUM_THREADS) / query the size of the current team
· omp_get_num_procs() -- the number of processors available to the program
· omp_get_thread_num() -- the number of the calling thread within its team
· omp_set_nested(int) / omp_get_nested() -- enable or query nested parallelism
· omp_in_parallel() -- are we currently inside a parallel region?
· omp_get_wtime() -- wall-clock time, convenient for measuring performance
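A minimal sketch (not from the slides) exercising several of these functions, including timing with omp_get_wtime():

#include <stdio.h>
#include <omp.h>

int main(void)
{
    double t0, t1;

    printf("processors available : %d\n", omp_get_num_procs());
    printf("inside parallel now? : %d\n", omp_in_parallel());   /* 0 here */

    omp_set_num_threads(4);            /* request 4 threads for the next region */

    t0 = omp_get_wtime();
    #pragma omp parallel
    {
        #pragma omp master
        printf("team size            : %d\n", omp_get_num_threads());
    }
    t1 = omp_get_wtime();

    printf("parallel region took %f seconds\n", t1 - t0);
    return 0;
}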



Example: computing Pi by numerical integration

    f(x) = 4.0 / (1 + x^2)          pi = the integral of f(x) over [0, 1]

[Figure: the graph of f(x) = 4/(1+x^2) for 0.0 <= x <= 1.0]

#include <stdio.h>

static long num_steps = 100000;
double step, pi;

int main()
{
    int i;
    double x, sum = 0.0;

    step = 1.0 / (double) num_steps;
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x*x);
    }
    pi = step * sum;
    printf("Pi = %f\n", pi);
}


Computing Pi through integration
#include <stdio.h>

static long num_steps = 100000;
double step, pi;

int main()
{
    int i;
    double x, sum = 0.0;

    step = 1.0 / (double) num_steps;
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x*x);
    }
    pi = step * sum;
    printf("Pi = %f\n", pi);
}

· Which variables should be shared?
· Which should be private?
· Which are reductions?


Pi: the OpenMP version

#include <stdio.h>

static long num_steps = 100000;
double step, pi;

int main()
{
    int i;
    double x, sum = 0.0;

    step = 1.0 / (double) num_steps;
    #pragma omp parallel for \
            private(x) reduction(+:sum)
    for (i = 0; i < num_steps; i++) {
        x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x*x);
    }
    pi = step * sum;
    printf("Pi = %f\n", pi);
}

i is private since it is the loop variable


Compiling OpenMP programs on the IBM Regatta


Compilers and options for OpenMP

· xlc_r -- the thread-safe C compiler
· xlC_r -- the thread-safe C++ compiler
· The usual xlc options apply (-o, -I include_dir, -L library_dir, -l library_name, -c, -g, etc., as for MPI programs)
· IBM-specific options:
  · -qsmp=omp  -- enable recognition of the OpenMP directives
  · -qsmp=auto -- automatic SMP parallelization by the compiler



Compiling an OpenMP program: an example

xlc_r -qsmp=omp -qarch=pwr4 -q64 -o my_c_program *.c

This compiles all the *.c files into a 64-bit executable my_c_program that supports the OpenMP directives.


Running an OpenMP program (Regatta)

ompsubmit [<ompsubmit options>] <executable> [<program arguments>]

(as usual, [] marks optional parts)

The parameters of ompsubmit are analogous to those of mpisubmit (see above). For example, to submit the program hello with a limit of 1 hour on 4 processors:

    ompsubmit -w 01:00 -n 4 hello