2 Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #ifndef OFFLOAD_ENGINE_H_INCLUDED
32 #define OFFLOAD_ENGINE_H_INCLUDED
39 #include "offload_common.h"
40 #include "coi/coi_client.h"
42 #define SIGNAL_IS_REMOVED ((OffloadDescriptor *)-1)
43 const int64_t no_stream
= -1;
48 MemRange() : m_start(0), m_length(0) {}
49 MemRange(const void *addr
, uint64_t len
) : m_start(addr
), m_length(len
) {}
51 const void* start() const {
55 const void* end() const {
56 return static_cast<const char*>(m_start
) + m_length
;
59 uint64_t length() const {
63 // returns true if given range overlaps with another one
64 bool overlaps(const MemRange
&o
) const {
65 // Two address ranges A[start, end) and B[start,end) overlap
66 // if A.start < B.end and A.end > B.start.
67 return start() < o
.end() && end() > o
.start();
70 // returns true if given range contains the other range
71 bool contains(const MemRange
&o
) const {
72 return start() <= o
.start() && o
.end() <= end();
80 // Data associated with a pointer variable
83 PtrData(const void *addr
, uint64_t len
) :
84 cpu_addr(addr
, len
), cpu_buf(0),
85 mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
86 ref_count(0), is_static(false)
92 PtrData(const PtrData
& ptr
):
93 cpu_addr(ptr
.cpu_addr
), cpu_buf(ptr
.cpu_buf
),
94 mic_addr(ptr
.mic_addr
), alloc_disp(ptr
.alloc_disp
),
95 mic_buf(ptr
.mic_buf
), mic_offset(ptr
.mic_offset
),
96 ref_count(ptr
.ref_count
), is_static(ptr
.is_static
)
99 bool operator<(const PtrData
&o
) const {
100 // Variables are sorted by the CPU start address.
101 // Overlapping memory ranges are considered equal.
102 return (cpu_addr
.start() < o
.cpu_addr
.start()) &&
103 !cpu_addr
.overlaps(o
.cpu_addr
);
106 long add_reference() {
111 return __sync_fetch_and_add(&ref_count
, 1);
112 #else // TARGET_WINNT
113 return _InterlockedIncrement(&ref_count
) - 1;
114 #endif // TARGET_WINNT
117 long remove_reference() {
122 return __sync_sub_and_fetch(&ref_count
, 1);
123 #else // TARGET_WINNT
124 return _InterlockedDecrement(&ref_count
);
125 #endif // TARGET_WINNT
128 long get_reference() const {
137 const MemRange cpu_addr
;
139 // CPU and MIC buffers
143 // placeholder for buffer address on mic
148 // additional offset to pointer data on MIC for improving bandwidth for
149 // data which is not 4K aligned
152 // if true buffers are created from static memory
154 mutex_t alloc_ptr_data_lock
;
157 // reference count for the entry
161 typedef std::list
<PtrData
*> PtrDataList
;
165 typedef std::set
<PtrData
> PtrSet
;
167 PtrData
* find_ptr_data(const void *ptr
) {
169 PtrSet::iterator res
= list
.find(PtrData(ptr
, 0));
172 if (res
== list
.end()) {
175 return const_cast<PtrData
*>(res
.operator->());
178 PtrData
* insert_ptr_data(const void *ptr
, uint64_t len
, bool &is_new
) {
180 std::pair
<PtrSet::iterator
, bool> res
=
181 list
.insert(PtrData(ptr
, len
));
183 PtrData
* ptr_data
= const_cast<PtrData
*>(res
.first
.operator->());
188 // It's necessary to lock as soon as possible.
189 // unlock must be done at call site of insert_ptr_data at
191 ptr_data
->alloc_ptr_data_lock
.lock();
196 void remove_ptr_data(const void *ptr
) {
198 list
.erase(PtrData(ptr
, 0));
207 // Data associated with automatic variable
210 AutoData(const void *addr
, uint64_t len
) :
211 cpu_addr(addr
, len
), ref_count(0)
214 bool operator<(const AutoData
&o
) const {
215 // Variables are sorted by the CPU start address.
216 // Overlapping memory ranges are considered equal.
217 return (cpu_addr
.start() < o
.cpu_addr
.start()) &&
218 !cpu_addr
.overlaps(o
.cpu_addr
);
221 long add_reference() {
223 return __sync_fetch_and_add(&ref_count
, 1);
224 #else // TARGET_WINNT
225 return _InterlockedIncrement(&ref_count
) - 1;
226 #endif // TARGET_WINNT
229 long remove_reference() {
231 return __sync_sub_and_fetch(&ref_count
, 1);
232 #else // TARGET_WINNT
233 return _InterlockedDecrement(&ref_count
);
234 #endif // TARGET_WINNT
237 long nullify_reference() {
239 return __sync_lock_test_and_set(&ref_count
, 0);
240 #else // TARGET_WINNT
241 return _InterlockedExchange(&ref_count
,0);
242 #endif // TARGET_WINNT
245 long get_reference() const {
251 const MemRange cpu_addr
;
254 // reference count for the entry
258 // Set of automatic variables
259 typedef std::set
<AutoData
> AutoSet
;
264 TargetImage(const char *_name
, const void *_data
, uint64_t _size
,
265 const char *_origin
, uint64_t _offset
) :
266 name(_name
), data(_data
), size(_size
),
267 origin(_origin
), offset(_offset
)
277 // file of origin and offset within that file
282 typedef std::list
<TargetImage
> TargetImageList
;
284 // dynamic library and Image associated with lib
287 DynLib(const char *_name
, const void *_data
,
289 name(_name
), data(_data
), lib(_lib
)
299 typedef std::list
<DynLib
> DynLibList
;
301 // Data associated with persistent auto objects
304 PersistData(const void *addr
, uint64_t routine_num
,
305 uint64_t size
, uint64_t thread
) :
306 stack_cpu_addr(addr
), routine_id(routine_num
), thread_id(thread
)
308 stack_ptr_data
= new PtrData(0, size
);
310 // 1-st key value - beginning of the stack at CPU
311 const void * stack_cpu_addr
;
312 // 2-nd key value - identifier of routine invocation at CPU
314 // 3-rd key value - thread identifier
317 // corresponding PtrData; only stack_ptr_data->mic_buf is used
318 PtrData
* stack_ptr_data
;
319 // used to get offset of the variable in stack buffer
320 char * cpu_stack_addr
;
323 typedef std::list
<PersistData
> PersistDataList
;
325 // Data associated with stream
328 Stream(int device
, int num_of_cpus
) :
329 m_number_of_cpus(num_of_cpus
), m_pipeline(0), m_last_offload(0),
334 COI::PipelineDestroy(m_pipeline
);
338 COIPIPELINE
get_pipeline(void) {
342 int get_device(void) {
346 int get_cpu_number(void) {
347 return(m_number_of_cpus
);
350 void set_pipeline(COIPIPELINE pipeline
) {
351 m_pipeline
= pipeline
;
354 OffloadDescriptor
* get_last_offload(void) {
355 return(m_last_offload
);
358 void set_last_offload(OffloadDescriptor
* last_offload
) {
359 m_last_offload
= last_offload
;
362 static Stream
* find_stream(uint64_t handle
, bool remove
);
364 static _Offload_stream
add_stream(int device
, int number_of_cpus
) {
365 m_stream_lock
.lock();
366 all_streams
[++m_streams_count
] = new Stream(device
, number_of_cpus
);
367 m_stream_lock
.unlock();
368 return(m_streams_count
);
371 typedef std::map
<uint64_t, Stream
*> StreamMap
;
373 static uint64_t m_streams_count
;
374 static StreamMap all_streams
;
375 static mutex_t m_stream_lock
;
380 int m_number_of_cpus
;
382 // The pipeline associated with the stream
383 COIPIPELINE m_pipeline
;
385 // The last offload that occurred via the stream
386 OffloadDescriptor
* m_last_offload
;
388 // Cpus used by the stream
389 std::bitset
<COI_MAX_HW_THREADS
> m_stream_cpus
;
392 typedef std::map
<uint64_t, Stream
*> StreamMap
;
394 // class representing a single engine
396 friend void __offload_init_library_once(void);
397 friend void __offload_fini_library(void);
399 #define check_result(res, tag, ...) \
401 if (res == COI_PROCESS_DIED) { \
402 fini_process(true); \
405 if (res != COI_SUCCESS) { \
406 __liboffload_error_support(tag, __VA_ARGS__); \
411 int get_logical_index() const {
415 int get_physical_index() const {
416 return m_physical_index
;
419 const COIPROCESS
& get_process() const {
423 uint64_t get_thread_id(void);
429 void unload_library(const void *data
, const char *name
);
432 void add_lib(const TargetImage
&lib
)
436 m_images
.push_back(lib
);
441 _Offload_stream stream
,
442 const std::list
<COIBUFFER
> &buffers
,
448 const COIEVENT
* deps
,
453 // temporary workaround for blocking behavior for myoiLibInit/Fini calls
454 void init_myo(COIEVENT
*event
) {
456 res
= COI::PipelineRunFunction(get_pipeline(),
457 m_funcs
[c_func_myo_init
],
458 0, 0, 0, 0, 0, 0, 0, 0, 0,
460 check_result(res
, c_pipeline_run_func
, m_index
, res
);
463 void fini_myo(COIEVENT
*event
) {
465 res
= COI::PipelineRunFunction(get_pipeline(),
466 m_funcs
[c_func_myo_fini
],
467 0, 0, 0, 0, 0, 0, 0, 0, 0,
469 check_result(res
, c_pipeline_run_func
, m_index
, res
);
471 #endif // MYO_SUPPORT
474 // Memory association table
476 PtrData
* find_ptr_data(const void *ptr
) {
477 return m_ptr_set
.find_ptr_data(ptr
);
480 PtrData
* find_targetptr_data(const void *ptr
) {
481 return m_targetptr_set
.find_ptr_data(ptr
);
484 PtrData
* insert_ptr_data(const void *ptr
, uint64_t len
, bool &is_new
) {
485 return m_ptr_set
.insert_ptr_data(ptr
, len
, is_new
);
488 PtrData
* insert_targetptr_data(const void *ptr
, uint64_t len
,
490 return m_targetptr_set
.insert_ptr_data(ptr
, len
, is_new
);
493 void remove_ptr_data(const void *ptr
) {
494 m_ptr_set
.remove_ptr_data(ptr
);
497 void remove_targetptr_data(const void *ptr
) {
498 m_targetptr_set
.remove_ptr_data(ptr
);
502 // Automatic variables
504 AutoData
* find_auto_data(const void *ptr
) {
505 AutoSet
&auto_vars
= get_auto_vars();
506 AutoSet::iterator res
= auto_vars
.find(AutoData(ptr
, 0));
507 if (res
== auto_vars
.end()) {
510 return const_cast<AutoData
*>(res
.operator->());
513 AutoData
* insert_auto_data(const void *ptr
, uint64_t len
) {
514 AutoSet
&auto_vars
= get_auto_vars();
515 std::pair
<AutoSet::iterator
, bool> res
=
516 auto_vars
.insert(AutoData(ptr
, len
));
517 return const_cast<AutoData
*>(res
.first
.operator->());
520 void remove_auto_data(const void *ptr
) {
521 get_auto_vars().erase(AutoData(ptr
, 0));
527 void add_signal(const void *signal
, OffloadDescriptor
*desc
) {
528 m_signal_lock
.lock();
529 m_signal_map
[signal
] = desc
;
530 m_signal_lock
.unlock();
533 OffloadDescriptor
* find_signal(const void *signal
, bool remove
) {
534 OffloadDescriptor
*desc
= 0;
536 m_signal_lock
.lock();
538 SignalMap::iterator it
= m_signal_map
.find(signal
);
539 if (it
!= m_signal_map
.end()) {
542 it
->second
= SIGNAL_IS_REMOVED
;
546 m_signal_lock
.unlock();
551 void stream_destroy(_Offload_stream handle
);
553 COIPIPELINE
get_pipeline(_Offload_stream stream
);
555 StreamMap
get_stream_map() {
559 // stop device process
560 void fini_process(bool verbose
);
562 // list of stacks active at the engine
563 PersistDataList m_persist_list
;
566 Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
571 for (StreamMap::iterator it
= m_stream_map
.begin();
572 it
!= m_stream_map
.end(); it
++) {
573 Stream
* stream
= it
->second
;
576 if (m_process
!= 0) {
582 void set_indexes(int logical_index
, int physical_index
) {
583 m_index
= logical_index
;
584 m_physical_index
= physical_index
;
587 // start process on device
590 void load_libraries(void);
591 void init_ptr_data(void);
593 // performs library initialization on the device side
594 pid_t
init_device(void);
597 // get pipeline associated with a calling thread
598 COIPIPELINE
get_pipeline(void);
600 // get automatic vars set associated with the calling thread
601 AutoSet
& get_auto_vars(void);
603 // destructor for thread data
604 static void destroy_thread_data(void *data
);
607 typedef std::set
<PtrData
> PtrSet
;
608 typedef std::map
<const void*, OffloadDescriptor
*> SignalMap
;
612 int m_physical_index
;
614 // number of COI pipes created for the engine
618 COIPROCESS m_process
;
620 // If false, device either has not been initialized or new libraries
625 // List of libraries to be loaded
626 TargetImageList m_images
;
629 PtrDataTable m_ptr_set
;
630 PtrDataTable m_targetptr_set
;
633 SignalMap m_signal_map
;
634 mutex_t m_signal_lock
;
637 StreamMap m_stream_map
;
638 mutex_t m_stream_lock
;
641 std::bitset
<COI_MAX_HW_THREADS
> m_cpus
;
643 // List of dynamic libraries to be registered
644 DynLibList m_dyn_libs
;
646 // constants for accessing device function handles
652 #endif // MYO_SUPPORT
654 c_func_var_table_size
,
655 c_func_var_table_copy
,
656 c_func_set_stream_affinity
,
659 static const char* m_func_names
[c_funcs_total
];
661 // device function handles
662 COIFUNCTION m_funcs
[c_funcs_total
];
664 // int -> name mapping for device signals
665 static const int c_signal_max
= 32;
666 static const char* c_signal_names
[c_signal_max
];
669 #endif // OFFLOAD_ENGINE_H_INCLUDED