shiftOut is not very efficient

Here is my overloaded version of shiftOut you might like to try.
I also have a shiftIn version if you are interested.

It uses fat16lib's fast output library, which you can get from the thread below:
http://arduino.cc/forum/index.php/topic,84044.0.html

template< const uint8_t _DataPin, const uint8_t _ClockPin, const uint8_t _BitOrder >
    void shiftOut( uint8_t val )
      {
        FastDigitalIO< _DataPin > f_DataPin;
        FastDigitalIO< _ClockPin > f_ClockPin;
        
        for( uint8_t u_Index = 0 ; u_Index < 8 ; ++u_Index ){
          
          f_DataPin.write( ( _BitOrder == LSBFIRST ) ? ( !!( val & ( 1 << u_Index ) ) ) : ( !!( val & ( 1 << ( 7 - u_Index ) ) ) ) ); 		
          f_ClockPin.write( HIGH );
          f_ClockPin.write( LOW );	
        }
        return;
      }

Use it like this:

  // Fast-write wrapper for pin 10; presumably the shift register's storage/latch line - confirm against the wiring.
  FastDigitalIO< 10 > f_Storage;

  // Drive pin 10 LOW, clock out one byte (data on pin 9, clock on pin 8, bit 0 first), then drive pin 10 HIGH.
  f_Storage.write( LOW );
  shiftOut< 9, 8, LSBFIRST >( B10101010 );
  f_Storage.write( HIGH );

The actual version I use is for a class that supports multiple shift registers of any size.

  template< const uint8_t _DataPin, const uint8_t _ClockPin, const uint8_t _BitOrder >
    void shiftOut( uint8_t val, uint16_t &bitsRemaining )
      {
        unsigned char u_BitsThisRun = ( bitsRemaining > 0x8 ) ? 0x8 : bitsRemaining;
        
        FastDigitalIO< _DataPin > f_DataPin;
        FastDigitalIO< _ClockPin > f_ClockPin;
        
        for( uint8_t u_Index = 0 ; u_Index < u_BitsThisRun ; ++u_Index ){
          
          f_DataPin.write( ( _BitOrder == LSBFIRST ) ? ( !!( val & ( 1 << u_Index ) ) ) : ( !!( val & ( 1 << ( 7 - u_Index ) ) ) ) ); 		
          f_ClockPin.write( HIGH );
          f_ClockPin.write( LOW );	
        }
        bitsRemaining -= u_BitsThisRun;
        return;
      }

It is used like this:

        // Total number of bits still to be clocked out; each shiftOut call below reduces it.
        uint16_t u_BitsRemaining = 20;
         FastDigitalIO< 10 > f_Storage;
       
        f_Storage.write( LOW );
        // Each call sends at most 8 bits, so 20 bits go out as runs of 8, 8 and 4.
        // The same OUTPUT_DATA value (not defined in this snippet) is passed on every call.
        while( u_BitsRemaining ) shiftOut< 9, 8, LSBFIRST >( OUTPUT_DATA, u_BitsRemaining );
        f_Storage.write( HIGH );