-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add bug triggering program in experiments.
It's a bug in pony, I only slightly modified counter.c such that several threads send to one actor. The actor mailbox has a race that leads to a crash. We'll need Sylvain's help to fix it without hurting performance terribly.
- Loading branch information
Stephan Brandauer
committed
Jun 12, 2014
1 parent
007105d
commit 0c2dcf6
Showing
2 changed files
with
117 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#include <pony/pony.h> | ||
#include <stdlib.h> | ||
#include <stdio.h> | ||
|
||
typedef struct counter_t | ||
{ | ||
uint64_t count; | ||
} counter_t; | ||
|
||
enum | ||
{ | ||
MSG_INIT, | ||
MSG_INC, | ||
MSG_SHOOT_ME, | ||
MSG_GETANDRESET, | ||
MSG_RESPONSE | ||
}; | ||
|
||
static pony_msg_t m_init = {0, {{NULL, 0, PONY_PRIMITIVE}}}; | ||
static pony_msg_t m_inc = {0, {{NULL, 0, PONY_PRIMITIVE}}}; | ||
static pony_msg_t m_shootme = {1, {{NULL, 0, PONY_ACTOR}}}; | ||
static pony_msg_t m_getandreset = {1, {{NULL, 0, PONY_ACTOR}}}; | ||
static pony_msg_t m_response = {1, {{NULL, 8, PONY_PRIMITIVE}}}; | ||
|
||
static void trace(void* p) | ||
{ | ||
counter_t* d = p; | ||
pony_trace64(&d->count); | ||
} | ||
|
||
static pony_msg_t* message_type(uint64_t id) | ||
{ | ||
switch(id) | ||
{ | ||
case MSG_INIT: return &m_init; | ||
case MSG_INC: return &m_inc; | ||
case MSG_GETANDRESET: return &m_getandreset; | ||
case MSG_RESPONSE: return &m_response; | ||
case MSG_SHOOT_ME: return &m_shootme; | ||
} | ||
|
||
return NULL; | ||
} | ||
|
||
static void dispatch(pony_actor_t* this, void* p, uint64_t id, int argc, pony_arg_t* argv); | ||
|
||
static pony_actor_type_t type = | ||
{ | ||
1, | ||
{trace, sizeof(counter_t), PONY_ACTOR}, | ||
message_type, | ||
dispatch | ||
}; | ||
|
||
static void dispatch(pony_actor_t* this, void* p, uint64_t id, int argc, pony_arg_t* argv) | ||
{ | ||
counter_t* d = p; | ||
|
||
switch(id) | ||
{ | ||
case PONY_MAIN: | ||
{ | ||
printf("starting..\n"); | ||
//int margc = argv[0].i; | ||
//char** margv = argv[1].p; | ||
//uint64_t count = 0; | ||
|
||
const int N = 20; | ||
pony_actor_t *actors[N]; | ||
for (int i = 0; i<N; ++i) { | ||
actors[i] = pony_create(&type); | ||
pony_arg_t arg = {.p = this }; | ||
pony_sendp(actors[i], MSG_SHOOT_ME, &arg); | ||
|
||
//pony_sendp(actor, MSG_GETANDRESET, this); | ||
} | ||
|
||
break; | ||
} | ||
case MSG_SHOOT_ME: | ||
{ | ||
printf("shooting..\n"); | ||
pony_actor_t *me = argv[1].p; | ||
for (int i=0; i < 1000; i++) { | ||
pony_send(me, MSG_INC); | ||
} | ||
break; | ||
} | ||
case MSG_INIT: | ||
d = pony_alloc(sizeof(counter_t)); | ||
d->count = 0; | ||
pony_set(d); | ||
break; | ||
|
||
case MSG_INC: | ||
d->count++; | ||
printf("count = %llu\n",d->count); | ||
break; | ||
|
||
case MSG_GETANDRESET: | ||
pony_sendi(argv[0].p, MSG_RESPONSE, d->count); | ||
d->count = 0; | ||
break; | ||
|
||
case MSG_RESPONSE: | ||
printf("%lu\n", argv[0].i); | ||
break; | ||
} | ||
} | ||
|
||
int main(int argc, char** argv) | ||
{ | ||
return pony_start(argc, argv, pony_create(&type)); | ||
} |
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you describe the error a little? Or possibly a terminal dump/gdb backtrace?
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One more thing! Please verify all such things with the unmodified pony runtime!
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can run it (there's a Makefile). It's the one I told you about on skype on Wednesday:
"when mpmcq_pop is reading into
next
-- i don't see how there's any synchronisation going on". https://github.com/parapluu/mylittlepony/blame/master/src/runtime/pony/src/mpmcq.c#L560c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
EDIT This is with an unmodified Pony runtime.
BTW: When I run this program using just ONE thread, the program still crashes.
Look at the address of actor in actor_run. 0x100 seems like a null-pointer dereference on a large struct.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@kaeluka I don't think any synchronisation is needed at line 56. And the loop does synchronise at the end of the while loop on https://github.com/parapluu/mylittlepony/blame/master/src/runtime/pony/src/mpmcq.c#L59.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@TobiasWrigstad How is the dereference of next->data ordered after the write on line 37?
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here are the fixes to your program:
pony_send(this, MSG_INIT);
at the beginning ofPONY_MAIN
. Otherwise you'll get a crash when you get shot because your actor does not have state yet.pony_arg_t arg = {.p = this };
andpony_sendp(actors[i], MSG_SHOOT_ME, &arg);
withpony_sendp(actors[i], MSG_SHOOT_ME, this);
. Your code sends the address to the argument list, not the address to the argument.pony_actor_t *me = argv[1].p;
topony_actor_t *me = argv[0].p;
-- classic!Now the program runs just fine!
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unless I am wrong, the empty list has one element in it. You take from one end and you insert at the other. There is only one thread popping elements, but multiple threads pushing.
Line 56 reads the data element of the first element of the list. The only way there is a race between push and pop is when the list is empty, i.e, pushing concurrently creates the very element that pop is always trying to remove. In this case, there are two possibilities, if thread the pop-thread is first, it will see the list is empty, so no problem. If the push-thread is first, pop will see the new element, and then replace the first element with the elements next pointer. That can of course be updated while we are poping. This is handled by line 59 which makes sure that the new first element we are installing is the same one we read on line 57. There is never a concurrency issue with line 56 and line 37.
(Again, unless I am wrong.)
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for spotting the bugs in the program. I'm going to push an updated version that fixes that. It's still crashing in the same line -- but rarely now. It took 12700 runs before it crashed right now.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What? It crashes. Sometimes after a couple of seconds of running (but how long it takes is irrelevant). That's nothing to be satisfied with IMO. I'm just letting the version loose on the original pony code.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't know of any program that I've run 12700 times without a spurious crash! :)
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@TobiasWrigstad Mind Poe's law: http://en.wikipedia.org/wiki/Poe's_law
Can't tell if you're joking. Edit: we're not talking about programs here. We're talking about method calls. If a program that does a couple thousand method calls crashes after a couple thousand executions, we've got a problem.
Just confirmed that the updated
counter.c
crashes with the ORIGINAL pony code as well.0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am half-joking. Have you been able to reproduce the bug? Can we be certain it is the program's fault?
I've now run the program 30.000 times without seeing the bug. Etc.
The other half is the question how many times you can run a program without getting affected by some random crap from outside the program, etc.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have been able to reproduce it with the original code (see last line of last post). If the program crashes, it is the program(mer)'s fault (who else?).
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The person who installed it?
The person who sold the faulty hardware?
The attacker in the other thread?
…
Let's drop this part of the discussion.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But anyway, Sylvain seems to be fixing it. The frequency of crashes is low enough for us to proceed.
0c2dcf6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think it is the same bug? Since I haven't seen the crash, I don't know. Can you paste the printout/backtrace of the crash?