Index: llvm/lib/Target/X86/X86FrameLowering.cpp
--- llvm/lib/Target/X86/X86FrameLowering.cpp.orig
+++ llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -15,6 +15,7 @@
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
+#include "X86ReturnProtectorLowering.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/Statistic.h"
@@ -49,7 +50,7 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget 
                                    MaybeAlign StackAlignOverride)
     : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
                           STI.is64Bit() ? -8 : -4),
-      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
+      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()), RPL() {
   // Cache a bunch of frame-related predicates for this subtarget.
   SlotSize = TRI->getSlotSize();
   Is64Bit = STI.is64Bit();
@@ -57,6 +58,7 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget 
   // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
   Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
   StackPtr = TRI->getStackRegister();
+  SaveArgs = Is64Bit ? STI.getSaveArgs() : 0;
 }
 
 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
@@ -102,7 +104,8 @@ bool X86FrameLowering::hasFPImpl(const MachineFunction
           MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
           MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
           MFI.hasStackMap() || MFI.hasPatchPoint() ||
-          (isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment()));
+          (isWin64Prologue(MF) && MFI.hasCopyImplyingStackAdjustment()) ||
+          SaveArgs);
 }
 
 static unsigned getSUBriOpcode(bool IsLP64) {
@@ -1450,6 +1453,24 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasic
   }
 }
 
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+
+  if (Subtarget.isCallingConvWin64(CallConv)) {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
+      X86::RCX, X86::RDX, X86::R8,  X86::R9
+    };
+    return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
+  }
+
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+  return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
+}
+
 bool X86FrameLowering::has128ByteRedZone(const MachineFunction &MF) const {
   // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
   // clobbered by any interrupt handler.
@@ -1881,6 +1902,43 @@ void X86FrameLowering::emitPrologue(MachineFunction &M
               .addReg(StackPtr)
               .setMIFlag(MachineInstr::FrameSetup);
 
+      if (SaveArgs && !Fn.arg_empty()) {
+        ArrayRef<MCPhysReg> GPRs =
+          get64BitArgumentGPRs(Fn.getCallingConv(), STI);
+        unsigned arg_size = Fn.arg_size();
+        unsigned RI = 0;
+        int64_t SaveSize = 0;
+
+        if (Fn.hasStructRetAttr()) {
+          GPRs = GPRs.drop_front(1);
+          arg_size--;
+        }
+
+        for (MCPhysReg Reg : GPRs) {
+          if (++RI > arg_size)
+            break;
+
+          SaveSize += SlotSize;
+
+          BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+            .addReg(Reg)
+            .setMIFlag(MachineInstr::FrameSetup);
+        }
+
+        // Realign the stack. PUSHes are the most space efficient.
+        while (SaveSize % getStackAlignment()) {
+          BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+            .addReg(GPRs.front())
+            .setMIFlag(MachineInstr::FrameSetup);
+
+          SaveSize += SlotSize;
+        }
+
+        //dlg StackSize -= SaveSize;
+        //dlg MFI.setStackSize(StackSize);
+        X86FI->setSaveArgSize(SaveSize);
+      }
+
         if (NeedsDwarfCFI) {
           if (ArgBaseReg.isValid()) {
             SmallString<64> CfaExpr;
@@ -2490,11 +2548,23 @@ void X86FrameLowering::emitEpilogue(MachineFunction &M
       emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
     }
     // Pop EBP.
-    BuildMI(MBB, MBBI, DL,
-            TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
-            MachineFramePtr)
+//    BuildMI(MBB, MBBI, DL,
+//            TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
+//            MachineFramePtr)
+//        .setMIFlag(MachineInstr::FrameDestroy);
+
+    if (X86FI->getSaveArgSize()) {
+      // LEAVE is effectively mov rbp,rsp; pop rbp
+      BuildMI(MBB, MBBI, DL, TII.get(X86::LEAVE64))
         .setMIFlag(MachineInstr::FrameDestroy);
+    } else {
+      // Pop EBP.
+      BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+             MachineFramePtr)
+         .setMIFlag(MachineInstr::FrameDestroy);
+    }
 
+
     // We need to reset FP to its untagged state on return. Bit 60 is currently
     // used to show the presence of an extended frame.
     if (X86FI->hasSwiftAsyncContext()) {
@@ -2534,7 +2604,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &M
       if (!PI->getFlag(MachineInstr::FrameDestroy) ||
           (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
            Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
-           Opc != X86::POP2P && Opc != X86::LEA64r))
+           Opc != X86::POP2P && Opc != X86::LEA64r && Opc != X86::LEAVE64))
         break;
       FirstCSPop = PI;
     }
@@ -2718,6 +2788,9 @@ StackOffset X86FrameLowering::getFrameIndexReference(c
            "FPDelta isn't aligned per the Win64 ABI!");
   }
 
+  if (FI >= 0)
+    Offset -= X86FI->getSaveArgSize();
+
   if (FrameReg == TRI->getFramePtr()) {
     // Skip saved EBP/RBP
     Offset += SlotSize;
@@ -4269,6 +4342,10 @@ void X86FrameLowering::adjustFrameForMsvcCxxEh(Machine
   addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
                     UnwindHelpFI)
       .addImm(-2);
+}
+
+const ReturnProtectorLowering *X86FrameLowering::getReturnProtector() const {
+  return &RPL;
 }
 
 void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
