Slow run Speed of optimized code in VS2005 vs VS2003 (Unmanaged C++)

C

Chris288

Hi,

I have a problem where our app, when compiled in VS2005, runs at about 50%
of the speed it attains in VS2003. This is an unmanaged C++ app.

I have tried most combinations of the optimization and language settings
with little change in run speed.

I compared the generated native code in various places in the code and
noticed two things.
a) The compiler's conditions for inlining code appear to be more
rigorous; fewer functions get inlined.
b) The VS2005 compiler appears to generate a lot more native code than
the VS2003 compiler. (The sizes of the VS2005 modules (DLL & EXE) are
also all larger than their VS2003 equivalents.)

I investigated one of the most heavily used methods in the two builds
and include assembler listings of the two below. My observations above
can be seen in these listings!

Can anyone please explain what is going wrong?
Is there a fix or something that I can do to solve this problem?

Regards,
Chris

VS2003
===============================================================================

//
--------------------------------------------------------------------------

double CSpecieDataBase::msHf(long Fidelity, PhMask Phase, double T_,
double P_, SpPropOveride *Ovr, double *M, double *pTotalM)
{
00C18EB0 push ebp
00C18EB1 mov ebp,esp
00C18EB3 push 0FFFFFFFFh
00C18EB5 push offset
__ehhandler$?msHf@CSpecieDataBase@@QAENJKNNPAVSpPropOveride@@PAN1@Z
(0C6B57Ah)
00C18EBA mov eax,dword ptr fs:[00000000h]
00C18EC0 push eax
00C18EC1 mov dword ptr fs:[0],esp
00C18EC8 sub esp,28h
double Et=0.0, Mt=0.0;
SpecieIter I(Phase);
00C18ECB mov eax,dword ptr [CDB (0D00F5Ch)]
00C18ED0 fld qword ptr [__real@0000000000000000 (0C76C70h)]
00C18ED6 mov eax,dword ptr [eax+4158h]
00C18EDC fstp qword ptr [ebp-1Ch]
00C18EDF mov edx,dword ptr [ebp+0Ch]
00C18EE2 fld qword ptr [__real@0000000000000000 (0C76C70h)]
00C18EE8 and eax,edx
00C18EEA fstp qword ptr [ebp-14h]
00C18EED push esi
00C18EEE push eax
00C18EEF mov ecx,offset SVI (0D01118h)
00C18EF4 mov dword ptr [ebp-20h],eax
00C18EF7 call CSysVecInfo::SetUpSkipList (0C17A40h)
00C18EFC mov ecx,eax
00C18EFE mov dword ptr [I],ecx
00C18F01 mov edx,dword ptr [SVI (0D01118h)]
00C18F07 xor eax,eax
00C18F09 mov dword ptr [ebp-2Ch],edx
00C18F0C mov esi,dword ptr [SVI+10h (0D01128h)]
00C18F12 mov dword ptr [ebp-30h],eax
00C18F15 mov byte ptr [ebp-24h],al
00C18F18 mov byte ptr [ebp-23h],al
00C18F1B mov dword ptr [ebp-28h],esi
00C18F1E mov dword ptr [ebp-4],eax
for (int i=-1; I.Loop(i); )
00C18F21 mov esi,dword ptr [ecx]
00C18F23 cmp esi,edx
00C18F25 jge CSpecieDataBase::msHf+0E9h (0C18F99h)
00C18F27 push ebx
00C18F28 mov ebx,dword ptr [M]
00C18F2B push edi
00C18F2C lea esp,[esp]
{
if ((M[i] > MeasTolerance) && SDB[i].CpDirect())
00C18F30 fld qword ptr [ebx+esi*8]
00C18F33 fcomp qword ptr [MeasTolerance (0CEF480h)]
00C18F39 fnstsw ax
00C18F3B test ah,41h
00C18F3E jne CSpecieDataBase::msHf+0DCh (0C18F8Ch)
00C18F40 mov eax,dword ptr [SDB (0D00F64h)]
00C18F45 mov edi,dword ptr [eax+esi*4]
00C18F48 test byte ptr [edi+824h],10h
00C18F4F jne CSpecieDataBase::msHf+0DCh (0C18F8Ch)
{
Et+=M[i]*SDB[i].msHf(Fidelity, T_, P_, Ovr, M);
00C18F51 mov ecx,dword ptr [Ovr]
00C18F54 fld qword ptr [P_]
00C18F57 mov edx,dword ptr [Fidelity]
00C18F5A push ebx
00C18F5B push ecx
00C18F5C sub esp,10h
00C18F5F fstp qword ptr [esp+8]
00C18F63 mov ecx,edi
00C18F65 fld qword ptr [T_]
00C18F68 fstp qword ptr [esp]
00C18F6B push edx
00C18F6C call CSpecie::mlHf (0C0D760h)
00C18F71 fdiv qword ptr [edi+8]
Mt+=M[i];
00C18F74 mov edx,dword ptr [ebp-2Ch]
00C18F77 mov ecx,dword ptr [I]
00C18F7A fmul qword ptr [ebx+esi*8]
00C18F7D fadd qword ptr [Et]
00C18F80 fstp qword ptr [Et]
00C18F83 fld qword ptr [Mt]
00C18F86 fadd qword ptr [ebx+esi*8]
00C18F89 fstp qword ptr [Mt]
00C18F8C mov edi,dword ptr [ecx+esi*4+4]
00C18F90 inc esi
00C18F91 add esi,edi
00C18F93 cmp esi,edx
00C18F95 jl CSpecieDataBase::msHf+80h (0C18F30h)
00C18F97 pop edi
00C18F98 pop ebx
}
}
if (pTotalM)
00C18F99 mov eax,dword ptr [pTotalM]
00C18F9C test eax,eax
00C18F9E pop esi
00C18F9F je CSpecieDataBase::msHf+0F6h (0C18FA6h)
*pTotalM=Mt;
00C18FA1 fld qword ptr [Mt]
00C18FA4 fstp qword ptr [eax]
return Et/GTZ(Mt);
00C18FA6 fld qword ptr [Mt]
00C18FA9 mov ecx,dword ptr [__imp_ZeroLimit (0C76098h)]
00C18FAF fcomp qword ptr [ecx]
00C18FB1 fnstsw ax
00C18FB3 test ah,41h
00C18FB6 jne CSpecieDataBase::msHf+10Dh (0C18FBDh)
00C18FB8 fld qword ptr [Mt]
00C18FBB jmp CSpecieDataBase::msHf+10Fh (0C18FBFh)
00C18FBD fld qword ptr [ecx]
00C18FBF fld qword ptr [Et]
00C18FC2 fdiv st,st(1)
00C18FC4 fstp qword ptr [P_]
00C18FC7 fstp st(0)
00C18FC9 mov dword ptr [ebp-4],0FFFFFFFFh
00C18FD0 lea ecx,[I]
00C18FD3 call SpecieIter::~SpecieIter (0C04680h)
00C18FD8 fld qword ptr [P_]
}
00C18FDB mov ecx,dword ptr [ebp-0Ch]
00C18FDE mov dword ptr fs:[0],ecx
00C18FE5 mov esp,ebp
00C18FE7 pop ebp
00C18FE8 ret 24h
--- No source file
-------------------------------------------------------------
00C18FEB int 3
00C18FEC int 3
00C18FED int 3
00C18FEE int 3
00C18FEF int 3


VS2005
===============================================================================

//
--------------------------------------------------------------------------

double CSpecieDataBase::msHf(long Fidelity, PhMask Phase, double T_,
double P_, SpPropOveride *Ovr, double *M, double *pTotalM)
{
00E32E30 push ebp
00E32E31 mov ebp,esp
00E32E33 push 0FFFFFFFFh
00E32E35 push offset
__ehhandler$?msHf@CSpecieDataBase@@QAENJKNNPAVSpPropOveride@@PAN1@Z
(0EB160Dh)
00E32E3A mov eax,dword ptr fs:[00000000h]
00E32E40 push eax
00E32E41 mov dword ptr fs:[0],esp
00E32E48 sub esp,54h
00E32E4B mov dword ptr [ebp-60h],ecx
double Et=0.0, Mt=0.0;
00E32E4E mov dword ptr [Et],0
00E32E55 mov dword ptr [ebp-18h],0
00E32E5C mov dword ptr [Mt],0
00E32E63 mov dword ptr [ebp-10h],0
SpecieIter I(Phase);
00E32E6A mov eax,dword ptr [Phase]
00E32E6D push eax
00E32E6E lea ecx,[I]
00E32E71 call SpecieIter::SpecieIter (0E26610h)
00E32E76 mov dword ptr [ebp-4],0
for (int i=-1; I.Loop(i); )
00E32E7D mov dword ptr [i],0FFFFFFFFh
00E32E84 mov ecx,dword ptr [i]
00E32E87 add ecx,1
00E32E8A mov dword ptr [i],ecx
00E32E8D mov edx,dword ptr [i]
00E32E90 mov eax,dword ptr [I]
00E32E93 mov ecx,dword ptr [i]
00E32E96 add ecx,dword ptr [eax+edx*4]
00E32E99 mov dword ptr [i],ecx
00E32E9C mov edx,dword ptr [i]
00E32E9F cmp edx,dword ptr [ebp-2Ch]
00E32EA2 setl al
00E32EA5 movzx ecx,al
00E32EA8 test ecx,ecx
00E32EAA je CSpecieDataBase::msHf+139h (0E32F69h)
{
if ((M[i] > MeasTolerance) && SDB[i].CpDirect())
00E32EB0 mov edx,dword ptr [i]
00E32EB3 mov eax,dword ptr [M]
00E32EB6 fld qword ptr [MeasTolerance (0F3EBD8h)]
00E32EBC fcomp qword ptr [eax+edx*8]
00E32EBF fnstsw ax
00E32EC1 test ah,5
00E32EC4 jp CSpecieDataBase::msHf+134h (0E32F64h)
00E32ECA mov ecx,dword ptr [i]
00E32ECD mov edx,dword ptr [SDB (0F501ECh)]
00E32ED3 mov eax,dword ptr [edx+ecx*4]
00E32ED6 mov dword ptr [ebp-48h],eax
00E32ED9 mov ecx,dword ptr [ebp-48h]
00E32EDC mov dl,byte ptr [ecx+824h]
00E32EE2 shr dl,4
00E32EE5 and dl,1
00E32EE8 movzx eax,dl
00E32EEB test eax,eax
00E32EED sete cl
00E32EF0 movzx edx,cl
00E32EF3 test edx,edx
00E32EF5 je CSpecieDataBase::msHf+134h (0E32F64h)
{
Et+=M[i]*SDB[i].msHf(Fidelity, T_, P_, Ovr, M);
00E32EF7 mov eax,dword ptr [i]
00E32EFA mov ecx,dword ptr [SDB (0F501ECh)]
00E32F00 mov edx,dword ptr [ecx+eax*4]
00E32F03 mov dword ptr [ebp-4Ch],edx
00E32F06 mov eax,dword ptr [M]
00E32F09 push eax
00E32F0A mov ecx,dword ptr [Ovr]
00E32F0D push ecx
00E32F0E sub esp,8
00E32F11 mov edx,dword ptr [P_]
00E32F14 mov eax,dword ptr [ebp+1Ch]
00E32F17 mov dword ptr [esp],edx
00E32F1A mov dword ptr [esp+4],eax
00E32F1E sub esp,8
00E32F21 mov ecx,dword ptr [T_]
00E32F24 mov edx,dword ptr [ebp+14h]
00E32F27 mov dword ptr [esp],ecx
00E32F2A mov dword ptr [esp+4],edx
00E32F2E mov eax,dword ptr [Fidelity]
00E32F31 push eax
00E32F32 mov ecx,dword ptr [ebp-4Ch]
00E32F35 call CSpecie::mlHf (0E2F7D0h)
00E32F3A mov ecx,dword ptr [ebp-4Ch]
00E32F3D fdiv qword ptr [ecx+8]
00E32F40 fstp qword ptr [ebp-54h]
00E32F43 mov edx,dword ptr [i]
00E32F46 mov eax,dword ptr [M]
00E32F49 fld qword ptr [eax+edx*8]
00E32F4C fmul qword ptr [ebp-54h]
00E32F4F fadd qword ptr [Et]
00E32F52 fstp qword ptr [Et]
Mt+=M[i];
00E32F55 mov ecx,dword ptr [i]
00E32F58 mov edx,dword ptr [M]
00E32F5B fld qword ptr [Mt]
00E32F5E fadd qword ptr [edx+ecx*8]
00E32F61 fstp qword ptr [Mt]
}
}
00E32F64 jmp CSpecieDataBase::msHf+54h (0E32E84h)
if (pTotalM)
00E32F69 cmp dword ptr [pTotalM],0
00E32F6D je CSpecieDataBase::msHf+14Dh (0E32F7Dh)
*pTotalM=Mt;
00E32F6F mov eax,dword ptr [pTotalM]
00E32F72 mov ecx,dword ptr [Mt]
00E32F75 mov edx,dword ptr [ebp-10h]
00E32F78 mov dword ptr [eax],ecx
00E32F7A mov dword ptr [eax+4],edx
return Et/GTZ(Mt);
00E32F7D sub esp,8
00E32F80 mov eax,dword ptr [Mt]
00E32F83 mov ecx,dword ptr [ebp-10h]
00E32F86 mov dword ptr [esp],eax
00E32F89 mov dword ptr [esp+4],ecx
00E32F8D call GTZ<double> (0D19EA0h)
00E32F92 add esp,8
00E32F95 fdivr qword ptr [Et]
00E32F98 fstp qword ptr [ebp-44h]
00E32F9B mov dword ptr [ebp-4],0FFFFFFFFh
00E32FA2 lea ecx,[I]
00E32FA5 call SpecieIter::~SpecieIter (0E26680h)
00E32FAA fld qword ptr [ebp-44h]
}
00E32FAD mov ecx,dword ptr [ebp-0Ch]
00E32FB0 mov dword ptr fs:[0],ecx
00E32FB7 mov esp,ebp
00E32FB9 pop ebp
00E32FBA ret 24h
--- No source file
 
N

Nathan Mates

I have a problem where our app when compiled in VS2005 runs about 50%
the speed it attains in VS2003. This is an unmanaged C++ app.
I have tried most combinations of the optimization and language settings
with little change in run speed.

I don't know if you've tried these settings in particular, but I'd
recommend trying them:

1) Right click on your project, select Properties. Under Configuration
Properties -> General, set Whole Program Optimization to 'Use Link
Time Code Generation'. This'll probably make your link times painful
if your exe is over a megabyte or so, but it's generally quite worth
it. I usually make separate build configs, one for 'Release' and one
for 'ReleaseLTCG'

2) Try setting Configuration Properties -> C/C++ -> Code Generation ->
Buffer Security Check to No. On the same page, turn off exceptions if
your code doesn't use them. [I also turn on enable string pooling, and
turn OFF minimal rebuild on that page, but that tends not to change
speed.]

3) Your code seems to be doing a lot of floating point work. Have you
tried setting C/C++ -> Code Generation -> Enable Enhanced Instruction
Set to SSE or SSE2? See
http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions and
http://en.wikipedia.org/wiki/SSE2 for lists of what processor(s)
support those. Frankly, by now, just about every consumer box out
there should support SSE, as the Pentium III and Athlon XP added it.
If you know what platforms you're running on in more detail, SSE2 may
be worth it.

4) If you have migrated your projects from 2003 -> 2005, then the
upgrade/migration "wizard" typically botches the optimization
settings, setting them to "custom" or somesuch. Check under C/C++, and
set Optimization to Minimize Size, favor small code, and do omit frame
pointers. That's what's recommended by MS for all but the most
critical loops. If you're generating 'fast' code, I've seen it unroll
*HUGE* loops in 1-time init code, which I have a gut feeling would
blow out the L1/L2 cache.

Nathan Mates
 
C

Chris288

Nathan,

Thanks for the reply.
I will try these suggestions and come back with the results!

Chris Moreton


Nathan said:
I have a problem where our app when compiled in VS2005 runs about 50%
the speed it attains in VS2003. This is an unmanaged C++ app.
I have tried most combinations of the optimization and language settings
with little change in run speed.

I don't know if you've tried these settings in particular, but I'd
recommend trying them:

1) Right click on your project, select Properties. Under Configuration
Properties -> General, set Whole Program Optimization to 'Use Link
Time Code Generation'. This'll probably make your link times painful
if your exe is over a megabyte or so, but it's generally quite worth
it. I usually make separate build configs, one for 'Release' and one
for 'ReleaseLTCG'

2) Try setting Configuration Properties -> C/C++ -> Code Generation ->
Buffer Security Check to No. On the same page, turn off exceptions if
your code doesn't use them. [I also turn on enable string pooling, and
turn OFF minimal rebuild on that page, but that tends not to change
speed.]

3) Your code seems to be doing a lot of floating point work. Have you
tried setting C/C++ -> Code Generation -> Enable Enhanced Instruction
Set to SSE or SSE2? See
http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions and
http://en.wikipedia.org/wiki/SSE2 for lists of what processor(s)
support those. Frankly, by now, just about every consumer box out
there should support SSE, as the Pentium III and Athlon XP added it.
If you know what platforms you're running on in more detail, SSE2 may
be worth it.

4) If you have migrated your projects from 2003 -> 2005, then the
upgrade/migration "wizard" typically botches the optimization
settings, setting them to "custom" or somesuch. Check under C/C++, and
set Optimization to Minimize Size, favor small code, and do omit frame
pointers. That's what's recommended by MS for all but the most
critical loops. If you're generating 'fast' code, I've seen it unroll
*HUGE* loops in 1-time init code, which I have a gut feeling would
blow out the L1/L2 cache.

Nathan Mates


--
<*> Nathan Mates - personal webpage http://www.visi.com/~nathan/
# Programmer at Pandemic Studios -- http://www.pandemicstudios.com/
# NOT speaking for Pandemic Studios. "Care not what the neighbors
# think. What are the facts, and to how many decimal places?" -R.A. Heinlein
 
J

Jeffrey Tan[MSFT]

Hi Chris,

Have you tried Nathan's suggestions? Do they make sense to you? If you
still need any help, please feel free to follow up. Thanks.

Best regards,
Jeffrey Tan
Microsoft Online Community Support
==================================================
Get notification to my posts through email? Please refer to
http://msdn.microsoft.com/subscriptions/managednewsgroups/default.aspx#notifications.

Note: The MSDN Managed Newsgroup support offering is for non-urgent issues
where an initial response from the community or a Microsoft Support
Engineer within 1 business day is acceptable. Please note that each follow
up response may take approximately 2 business days as the support
professional working with you may need further investigation to reach the
most efficient resolution. The offering is not appropriate for situations
that require urgent, real-time or phone-based interactions or complex
project analysis and dump analysis issues. Issues of this nature are best
handled working with a dedicated Microsoft Support Engineer by contacting
Microsoft Customer Support Services (CSS) at
http://msdn.microsoft.com/subscriptions/support/default.aspx.
==================================================
This posting is provided "AS IS" with no warranties, and confers no rights.
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Top