path: root/src/rt/arch/i386/morestack.S
/*
	__morestack

	This function implements stack growth using the mechanism
	devised by Ian Lance Taylor for gccgo, described here:

	http://gcc.gnu.org/wiki/SplitStacks

	The Rust stack is composed of a linked list of stack segments,
	and each stack segment contains two parts: the work area,
	where Rust functions are allowed to execute; and the red zone,
	where no Rust code can execute, but where short runtime
	functions (including __morestack), the dynamic linker, signal
	handlers, and the unwinder can run.
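
	As a rough picture (purely illustrative; the exact layout and
	sizes belong to the runtime, not this file), one segment looks
	like:

	    +----------------------+  <- top of the segment
	    |      work area       |     Rust code runs here, growing downward
	    +----------------------+  <- roughly where the TLS stack limit points
	    |       red zone       |     __morestack, signal handlers, the unwinder
	    +----------------------+  <- bottom of the segment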

	Each Rust function contains an LLVM-generated prologue that
	compares the stack space required for the current function to
	the space remaining in the current stack segment,
	maintained in a platform-specific TLS slot.  The stack limit
	is strategically maintained by the Rust runtime so that it is
	always in place whenever a Rust function is running.

	When there is not enough room to run the function, the function
	prologue makes a call to __morestack to allocate a new stack
	segment, copy any stack-based arguments to it, switch stacks,
	then resume execution of the original function.
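
	As an illustration, the i386 prologue for a split-stack
	function looks roughly like this; the TLS slot holding the
	stack limit and the pushed constants are placeholders, not
	the output of any particular compiler:

	    fn:
	        cmpl %gs:0x30, %esp    // enough space left in this segment?
	        ja   1f                // yes: run the function normally
	        pushl $8               // size of the stack-based arguments
	        pushl $44              // stack space the function needs
	        calll __morestack      // switches stacks and runs fn for us
	        ret                    // reached only via __morestack's final return
	    1:
	        // ... ordinary prologue and function body ...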

	-- The __morestack calling convention --

	For reasons of efficiency the __morestack calling convention
	is bizarre. The calling function does not attempt to align the
	stack for the call, and on x86_64 the arguments to __morestack
	are passed in scratch registers in order to preserve the
	original function's arguments.

	Once __morestack has switched to the new stack, instead of
	returning, it then calls into the original function, resuming
	execution at the instruction following the call to
	__morestack. Thus, when the original function returns it
	actually returns to __morestack, which then deallocates the
	stack and returns again to the original function's caller.
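
	Schematically, for a function fn called from caller, the whole
	dance is (a sketch of the scheme above, not a trace):

	    caller       --call-->  fn's split-stack prologue
	    prologue     --call-->  __morestack (on the old, misaligned stack)
	    __morestack  --call-->  fn's body, on the newly allocated stack
	    fn's body    --ret-->   __morestack, which frees the new stack
	    __morestack  --ret-->   the prologue's trailing ret instruction
	    trailing ret --ret-->   caller, as if nothing had happened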

	-- Unwinding --

	All this trickery causes hell when it comes time for the
	unwinder to navigate its way through this function. What
	will happen is the original function will be unwound first
	without any special effort, then the unwinder encounters
	the __morestack frame, which is sitting just above a
	tiny fraction of a frame (containing just a return pointer
	and, on 32-bit, the arguments to __morestack).
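
	Concretely, the old stack looks like this when __morestack has
	control (higher addresses first; a sketch, not a memory dump):

	    stack-based arguments to the original function
	    return address into the original function's caller   \
	    size of the stack-based arguments                      } the tiny
	    stack space needed by the original function           /  partial frame
	    return address into the split-stack prologue    <- %esp on entry
	    (saved %ebp and the rest of __morestack's frame below this)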

	We deal with this by claiming that that little bit of stack
	is actually part of the __morestack frame, encoded as
	DWARF call frame instructions (CFI) by .cfi assembler
	pseudo-ops.

	One final complication (that took me a week to figure out)
	is that OS X 10.6+ uses its own 'compact unwind info',
	an undocumented format generated by the linker from
	the DWARF CFI. This compact unwind info doesn't correctly
	capture the nuance of the __morestack frame, and as a
	result all of our linking on OS X uses the -no_compact_unwind
	flag.
*/

.text

#if defined(__APPLE__)
#define RUST_GET_TASK           L_rust_get_task$stub
#define UPCALL_CALL_C           L_upcall_call_shim_on_c_stack$stub
#define MORESTACK               ___morestack
#else
#if defined(__linux__)
#define UPCALL_NEW_STACK        upcall_new_stack
#define UPCALL_DEL_STACK        upcall_del_stack
#define RUST_GET_TASK           rust_get_task
#define UPCALL_CALL_C           upcall_call_shim_on_c_stack
#define MORESTACK               __morestack
#else
#define UPCALL_NEW_STACK        _upcall_new_stack
#define UPCALL_DEL_STACK        _upcall_del_stack
#define RUST_GET_TASK           _rust_get_task
#define UPCALL_CALL_C           _upcall_call_shim_on_c_stack
#define MORESTACK               ___morestack
#endif
#endif

.globl UPCALL_NEW_STACK
.globl UPCALL_DEL_STACK
#ifndef __APPLE__
.globl RUST_GET_TASK
.globl UPCALL_CALL_C
#endif
.globl MORESTACK

// FIXME: What about _WIN32?	
#if defined(__linux__)
	.hidden MORESTACK
#else
#if defined(__APPLE__)
	.private_extern MORESTACK
#endif
#endif

#ifdef __ELF__
	.type MORESTACK,@function
#endif

MORESTACK:
#if defined(__linux__) || defined(__APPLE__)
	.cfi_startproc
#endif

	// This base pointer setup differs from most in that we are
	// telling the unwinder to consider the Canonical Frame
	// Address (CFA) for this frame to be the value of the stack
	// pointer prior to entry to the original function, whereas
	// the CFA would typically be the value of the stack
	// pointer prior to entry to this function. This will allow
	// the unwinder to understand how to skip the tiny partial
	// frame that the original function created by calling
	// __morestack.

	// In practical terms, our CFA is 12 bytes greater than it
	// would normally be, accounting for the two arguments to
	// __morestack, and an extra return address.

	pushl %ebp
#if defined(__linux__) || defined(__APPLE__)
	// The CFA is 20 bytes above the current CFA register
	// (%esp here, and %ebp once we switch to it below)
	.cfi_def_cfa_offset 20
	// %ebp is -20 bytes from the CFA
	.cfi_offset %ebp, -20
#endif
	movl %esp, %ebp
#if defined(__linux__) || defined(__APPLE__)
	// Calculate the CFA as an offset from %ebp
	.cfi_def_cfa_register %ebp
#endif

	// NB: This can be called with the fastcc convention so we
	// have to preserve any argument registers

	// NB: __morestack is called misaligned by 4 bytes, i.e.
	// subl $4, %esp would get us to a normal alignment

	subl $44,%esp

	// Save fastcc arguments
	movl %ecx, 28(%esp)
	movl %edx, 24(%esp)

	// FIXME (1226): main is compiled with the split-stack prologue,
	// causing it to call __morestack, so we have to jump back out
	calll RUST_GET_TASK
	testl %eax,%eax
	jz .L$bail

	// Save the correct %esp value for our grandparent frame,
	// for the unwinder
	// FIXME: This isn't used
	leal 20(%ebp), %eax
	movl %eax, -4(%ebp)

	// The arguments to upcall_new_stack

	// The size of the stack arguments to copy to the new stack,
	// one of the arguments to __morestack
	movl 56(%esp),%eax
	movl %eax,20(%esp)
	// The address of the stack arguments to the original function
	leal 64(%esp),%eax
	movl %eax,16(%esp)
	// The amount of stack needed for the original function,
	// the other argument to __morestack
	movl 52(%esp),%eax // The amount of stack needed
	movl %eax,12(%esp)
	// Out pointer to the new stack
	movl $0, 8(%esp)

#ifdef __APPLE__
	call 1f
1:	popl %eax
	movl L_upcall_new_stack$non_lazy_ptr-1b(%eax),%eax
	movl %eax, 4(%esp)
#else
	movl $UPCALL_NEW_STACK,4(%esp)
#endif

	leal 8(%esp), %eax
	movl %eax,(%esp)
	call UPCALL_CALL_C

	// Grab the __morestack return pointer
	movl 48(%esp),%eax
	// Skip past the ret instruction in the parent fn
	inc  %eax

	// Restore the fastcc arguments to the original function
	movl 28(%esp), %ecx
	movl 24(%esp), %edx

	// Switch stacks
	movl 8(%esp),%esp
	// Re-enter the function that called us
	call *%eax

	// Now the function that called us has returned, so we need to
	// delete the old stack space

	// Switch back to the rust stack
	movl %ebp, %esp

	// Remember that __morestack is called misaligned, so %ebp is
	// not aligned to a 16-byte boundary; these 4 bytes realign the stack.
	subl $4, %esp

	// Now that we're on the return path we want to avoid
	// stomping on %eax. FIXME: Need to save and restore %eax to
	// actually preserve it across the call to delete the stack
#ifdef __APPLE__
	call 1f
1:	popl %ecx
	movl L_upcall_del_stack$non_lazy_ptr-1b(%ecx),%ecx
	pushl %ecx
#else
	pushl $UPCALL_DEL_STACK
#endif

	pushl $0
	call UPCALL_CALL_C

	addl $12,%esp

	popl %ebp

	// FIXME: I don't think these rules are necessary
	// since the unwinder should never encounter an instruction
	// pointer pointing here.
#if defined(__linux__) || defined(__APPLE__)
	// Restore the rule for how to find %ebp
	.cfi_restore %ebp
	// Tell the unwinder how to find the CFA in terms of %esp
	.cfi_def_cfa %esp, 16
#endif
	retl $8

.L$bail:
	// Grab the __morestack return pointer (same slot as above)
	movl 48(%esp),%eax
	inc %eax
	
	addl $44, %esp
	popl %ebp
	addl $4+8,%esp
	
	jmpl *%eax

#if defined(__linux__) || defined(__APPLE__)
	.cfi_endproc
#endif

#ifdef __APPLE__

	.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_upcall_new_stack$non_lazy_ptr:
	.indirect_symbol _upcall_new_stack
	.long 0
L_upcall_del_stack$non_lazy_ptr:
	.indirect_symbol _upcall_del_stack
	.long 0

.section __IMPORT,__jump_table,symbol_stubs,pure_instructions+self_modifying_code,5

	// The linker will replace these hlt bytes (the .ascii below) with jmps
L_rust_get_task$stub:
	.indirect_symbol _rust_get_task
	.ascii	 "\364\364\364\364\364"

L_upcall_call_shim_on_c_stack$stub:
	.indirect_symbol _upcall_call_shim_on_c_stack
	.ascii	 "\364\364\364\364\364"

	.subsections_via_symbols
#endif