-
-
Notifications
You must be signed in to change notification settings - Fork 30.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
os.sched_yield() + range() unexpected influence #121512
Comments
P.S. for i in range(arg):
cnt += 1
queue.put(True) gives the same, but in range |
#96078 ? |
@Eclips4 would you mind assigning this issue to me? |
Problem solved, I think. I guess you are using an Intel CPU (>= 10th gen?). sched_yield() may move the process to a different CPU core, so we get different speeds on the power cores and the efficiency cores. You can bind the process to a single core:
from os import sched_yield
from queue import SimpleQueue
from threading import Thread
from time import monotonic
# Duration, in seconds, of each producer run in main().
period = 2
import os

# Pin this process to CPU 1 so producer and consumer cannot be moved between
# cores while the benchmark runs.
pid = os.getpid()
cpu_set = {1}
os.sched_setaffinity(pid, cpu_set)

# Read the affinity back and print it so the pinning is visible in the output.
affinity = os.sched_getaffinity(pid)
print(f"Current CPU affinity: {affinity}")
def _thread(queue1: SimpleQueue) -> None:
consumed = 0
start = monotonic()
while queue1.get():
consumed += 1
end = monotonic()
print(f'Consumed: {consumed / 1000 / (end - start):12.2f} Kitems/sec.')
def main():
    """Run the producer/consumer benchmark once per batch size.

    For each ``arg`` a fresh queue and a consumer thread are created. The
    producer then pushes ``True`` in batches of ``arg`` attempts for
    ``period`` seconds, sends a ``False`` sentinel, waits for the consumer
    and prints the production rate in thousands of items per second.

    ``cnt`` counts attempts: it is incremented even when the put is skipped
    because the queue already holds more than 1048576 items.
    """
    for arg in (1, 10, 100, 1000, 10_000, 100_000, 1000_000, 10_000_000):
        print(f'Testing for {arg=}')
        queue: SimpleQueue[bool] = SimpleQueue()
        thread = Thread(target=_thread, args=(queue,), daemon=True)
        thread.start()
        cnt = 0
        deadline = monotonic() + period
        while monotonic() < deadline:
            for i in range(arg):
                cnt += 1
                # Cap the backlog so the queue cannot grow without bound.
                if queue.qsize() <= 1048576:
                    queue.put(True)
                # NOTE(review): the original indentation was lost; the yield
                # is placed per item here, mirroring the C port of this test.
                # Confirm against the original report.
                sched_yield()  # with commented out - no bugs for any `range()`
                # time.sleep(0)  # no bugs for any `range()`
        stop = monotonic()
        queue.put(False)
        thread.join()
        print(f'Produced: {cnt / 1000 / (stop - (deadline - period)):12.2f} Kitems/sec.')
main() |
Yes, |
You can try |
I'm using an AMD 7950X. With the first test case, the result is the same as in the original issue.
using
If I remember correctly, AMD CPUs don't have power cores and efficiency cores?
|
Yes, you are right. Let's find out more details about this. |
Interesting. I wrote a C version of the test. Here is the version without CPU binding:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <stdbool.h>
#include <sys/time.h>
#include <sched.h>
#include <errno.h>
#define QUEUE_SIZE 1048576
#define PERIOD 2
#define WAKEUP_INTERVAL 100
/*
 * Fixed-capacity ring buffer of bool values shared between the producer and
 * the consumer thread. enqueue() reports "full" when advancing tail would
 * make it equal to head, so at most QUEUE_SIZE - 1 items are stored at once.
 */
typedef struct {
/* Item storage, indexed modulo QUEUE_SIZE. */
bool buffer[QUEUE_SIZE];
/* Index of the next item dequeue() will read. */
int head;
/* Index of the next slot enqueue() will write. */
int tail;
/* Held by enqueue()/dequeue() while they touch head, tail or buffer. */
pthread_mutex_t lock;
} SimpleQueue;
/*
 * Put `queue` into its empty state: both indices at zero, the mutex
 * initialised with default attributes, and every buffer slot set to false.
 */
void init_queue(SimpleQueue *queue) {
    queue->head = 0;
    queue->tail = 0;
    pthread_mutex_init(&queue->lock, NULL);

    /* Walk the buffer with a pointer instead of an index. */
    bool *slot = queue->buffer;
    bool *const past_end = queue->buffer + QUEUE_SIZE;
    while (slot != past_end) {
        *slot++ = false;
    }
}
/*
 * Append `value` to the queue under the queue lock.
 *
 * Returns true when the value was stored, false when the queue was full
 * (advancing tail would collide with head); a full queue is left untouched.
 */
bool enqueue(SimpleQueue *queue, bool value) {
    bool stored = false;

    pthread_mutex_lock(&queue->lock);
    const int slot = queue->tail;
    const int successor = (slot + 1) % QUEUE_SIZE;
    if (successor != queue->head) {
        queue->buffer[slot] = value;
        queue->tail = successor;
        stored = true;
    }
    pthread_mutex_unlock(&queue->lock);

    return stored;
}
/*
 * Remove the oldest item from the queue under the queue lock.
 *
 * Returns true and writes the item to `*value` when one was available;
 * returns false and leaves `*value` untouched when the queue was empty.
 */
bool dequeue(SimpleQueue *queue, bool *value) {
    bool fetched = false;

    pthread_mutex_lock(&queue->lock);
    const int slot = queue->head;
    if (slot != queue->tail) {
        *value = queue->buffer[slot];
        queue->head = (slot + 1) % QUEUE_SIZE;
        fetched = true;
    }
    pthread_mutex_unlock(&queue->lock);

    return fetched;
}
/*
 * Read CLOCK_MONOTONIC and return it as seconds in a double
 * (whole seconds plus the nanosecond part scaled down by 1e9).
 */
double monotonic_time() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double whole_seconds = now.tv_sec;
    const double fraction = now.tv_nsec / 1000000000.0;
    return whole_seconds + fraction;
}
void *thread_func(void *arg) {
SimpleQueue *queue = (SimpleQueue *)arg;
int consumed = 0;
double start = monotonic_time();
bool value;
while (true) {
if (dequeue(queue, &value) && value) {
consumed++;
} else {
break; // exit loop when dequeue returns false or value is false
}
}
double end = monotonic_time();
printf("Consumed: %12.2f Kitems/sec.\n", consumed / 1000.0 / (end - start));
return NULL;
}
int main() {
int args[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000};
for (int i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
int arg = args[i];
printf("Testing for arg=%d\n", arg);
SimpleQueue queue;
init_queue(&queue);
pthread_t thread;
pthread_create(&thread, NULL, thread_func, &queue);
int cnt = 0;
double deadline = monotonic_time() + PERIOD;
while (monotonic_time() < deadline) {
for (int j = 0; j < arg; j++) {
cnt++;
if (enqueue(&queue, true)) {
if (cnt % WAKEUP_INTERVAL == 0) {
sched_yield();
}
}
}
}
double stop = monotonic_time();
enqueue(&queue, false);
pthread_join(thread, NULL);
printf("Produced: %12.2f Kitems/sec.\n", cnt / 1000.0 / (stop - (deadline - PERIOD)));
}
return 0;
} The result here
Here is the version with CPU binding:
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <stdbool.h>
#include <sys/time.h>
#include <sched.h>
#include <errno.h>
#define QUEUE_SIZE 1048576
#define PERIOD 2
#define WAKEUP_INTERVAL 100
/*
 * Fixed-capacity ring buffer of bool values shared between the producer and
 * the consumer thread. enqueue() reports "full" when advancing tail would
 * make it equal to head, so at most QUEUE_SIZE - 1 items are stored at once.
 */
typedef struct {
/* Item storage, indexed modulo QUEUE_SIZE. */
bool buffer[QUEUE_SIZE];
/* Index of the next item dequeue() will read. */
int head;
/* Index of the next slot enqueue() will write. */
int tail;
/* Held by enqueue()/dequeue() while they touch head, tail or buffer. */
pthread_mutex_t lock;
} SimpleQueue;
/*
 * Put `queue` into its empty state: both indices at zero, the mutex
 * initialised with default attributes, and every buffer slot set to false.
 */
void init_queue(SimpleQueue *queue) {
    queue->head = 0;
    queue->tail = 0;
    pthread_mutex_init(&queue->lock, NULL);

    /* Walk the buffer with a pointer instead of an index. */
    bool *slot = queue->buffer;
    bool *const past_end = queue->buffer + QUEUE_SIZE;
    while (slot != past_end) {
        *slot++ = false;
    }
}
/*
 * Append `value` to the queue under the queue lock.
 *
 * Returns true when the value was stored, false when the queue was full
 * (advancing tail would collide with head); a full queue is left untouched.
 */
bool enqueue(SimpleQueue *queue, bool value) {
    bool stored = false;

    pthread_mutex_lock(&queue->lock);
    const int slot = queue->tail;
    const int successor = (slot + 1) % QUEUE_SIZE;
    if (successor != queue->head) {
        queue->buffer[slot] = value;
        queue->tail = successor;
        stored = true;
    }
    pthread_mutex_unlock(&queue->lock);

    return stored;
}
/*
 * Remove the oldest item from the queue under the queue lock.
 *
 * Returns true and writes the item to `*value` when one was available;
 * returns false and leaves `*value` untouched when the queue was empty.
 */
bool dequeue(SimpleQueue *queue, bool *value) {
    bool fetched = false;

    pthread_mutex_lock(&queue->lock);
    const int slot = queue->head;
    if (slot != queue->tail) {
        *value = queue->buffer[slot];
        queue->head = (slot + 1) % QUEUE_SIZE;
        fetched = true;
    }
    pthread_mutex_unlock(&queue->lock);

    return fetched;
}
/*
 * Read CLOCK_MONOTONIC and return it as seconds in a double
 * (whole seconds plus the nanosecond part scaled down by 1e9).
 */
double monotonic_time() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double whole_seconds = now.tv_sec;
    const double fraction = now.tv_nsec / 1000000000.0;
    return whole_seconds + fraction;
}
void *thread_func(void *arg) {
SimpleQueue *queue = (SimpleQueue *)arg;
int consumed = 0;
double start = monotonic_time();
bool value;
while (true) {
if (dequeue(queue, &value) && value) {
consumed++;
} else {
break; // exit loop when dequeue returns false or value is false
}
}
double end = monotonic_time();
printf("Consumed: %12.2f Kitems/sec.\n", consumed / 1000.0 / (end - start));
return NULL;
}
void set_cpu_affinity() {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(1, &mask);
if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
perror("sched_setaffinity");
exit(EXIT_FAILURE);
}
}
int main() {
set_cpu_affinity();
cpu_set_t mask;
CPU_ZERO(&mask);
if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
perror("sched_getaffinity");
exit(EXIT_FAILURE);
}
printf("Current CPU affinity: ");
for (int i = 0; i < CPU_SETSIZE; i++) {
if (CPU_ISSET(i, &mask)) {
printf("%d ", i);
}
}
printf("\n");
int args[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000};
for (int i = 0; i < sizeof(args) / sizeof(args[0]); i++) {
int arg = args[i];
printf("Testing for arg=%d\n", arg);
SimpleQueue queue;
init_queue(&queue);
pthread_t thread;
pthread_create(&thread, NULL, thread_func, &queue);
int cnt = 0;
double deadline = monotonic_time() + PERIOD;
while (monotonic_time() < deadline) {
for (int j = 0; j < arg; j++) {
cnt++;
if (enqueue(&queue, true)) {
if (cnt % WAKEUP_INTERVAL == 0) {
sched_yield();
}
}
}
}
double stop = monotonic_time();
enqueue(&queue, false);
pthread_join(thread, NULL);
printf("Produced: %12.2f Kitems/sec.\n", cnt / 1000.0 / (stop - (deadline - PERIOD)));
}
return 0;
} The result here
|
@Eclips4 would you mind helping me reopen this issue? I think we have not found the root cause yet. |
Do you think the problem is on the CPython side? |
99.999% not on the CPython side... lol |
If this isn't on the CPython side, then I think this issue doesn't need to be reopened. :) |
Got it. I will try to get more details about this, and I may ask you to reopen it if I find that the issue is on the CPython side.
Bug report
Bug description:
The counters printed are expected to be almost the same. They are. But some arguments for range(), combined with sched_yield(), significantly change the picture. Note that sched_yield() slows down ANOTHER thread and unexpectedly speeds up the current one.
CPython versions tested on:
3.12
Operating systems tested on:
Linux
The text was updated successfully, but these errors were encountered: