// Copyright (c) 2014 Dropbox, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "codegen/irgen.h"

#include <cstdio>
#include <iostream>
#include <sstream>
#include <stdint.h>

#include "llvm/Analysis/Passes.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/PassManager.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"

#include "analysis/function_analysis.h"
#include "analysis/scoping_analysis.h"
#include "analysis/type_analysis.h"
#include "codegen/codegen.h"
#include "codegen/compvars.h"
#include "codegen/gcbuilder.h"
#include "codegen/irgen/irgenerator.h"
#include "codegen/irgen/util.h"
#include "codegen/opt/escape_analysis.h"
#include "codegen/opt/inliner.h"
#include "codegen/opt/passes.h"
#include "codegen/osrentry.h"
#include "codegen/patchpoints.h"
#include "codegen/stackmaps.h"
#include "core/ast.h"
#include "core/cfg.h"
#include "core/options.h"
#include "core/stats.h"
#include "core/util.h"
#include "runtime/objmodel.h"
#include "runtime/types.h"

namespace pyston {

typedef std::unordered_set<CFGBlock*> BlockSet;

// This is where you can add a hook for any instruction added through the IRBuilder.
// It's currently not doing any hooking; hopefully there's not too much overhead from this.
void MyInserter::InsertHelper(llvm::Instruction* I, const llvm::Twine& Name, llvm::BasicBlock* BB,
                              llvm::BasicBlock::iterator InsertPt) const {
    llvm::IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
}

static void addIRDebugSymbols(llvm::Function* f) {
    llvm::legacy::PassManager mpm;

    llvm::error_code code = llvm::sys::fs::create_directory(".debug_ir", true);
    assert(!code);

    mpm.add(llvm::createDebugIRPass(false, false, ".debug_ir", f->getName()));

    mpm.run(*g.cur_module);
}

static void optimizeIR(llvm::Function* f, EffortLevel::EffortLevel effort) {
    // TODO maybe should do some simple passes (ex: gvn?) if effort level isn't maximal?
    // In general, this function needs a lot of tuning.
    if (effort < EffortLevel::MAXIMAL)
        return;

    Timer _t("optimizing");

    llvm::FunctionPassManager fpm(g.cur_module);

    // TODO: using this as a pass is a legacy cludge that shouldn't be necessary any more; can it be updated?
    fpm.add(new llvm::DataLayoutPass(*g.tm->getDataLayout()));

    if (ENABLE_INLINING && effort >= EffortLevel::MAXIMAL)
        fpm.add(makeFPInliner(275));
    fpm.add(llvm::createCFGSimplificationPass());

    fpm.add(llvm::createBasicAliasAnalysisPass());
    fpm.add(llvm::createTypeBasedAliasAnalysisPass());
    if (ENABLE_PYSTON_PASSES) {
        fpm.add(new EscapeAnalysis());
        fpm.add(createPystonAAPass());
    }

    if (ENABLE_PYSTON_PASSES)
        fpm.add(createMallocsNonNullPass());

    // TODO Find the right place for this pass (and ideally not duplicate it)
    if (ENABLE_PYSTON_PASSES) {
        fpm.add(llvm::createGVNPass());
        fpm.add(createConstClassesPass());
    }

    // TODO: find the right set of passes
    if (0) {
        // My original set of passes, that seem to get about 90% of the benefit:
        fpm.add(llvm::createInstructionCombiningPass());
        fpm.add(llvm::createReassociatePass());
        fpm.add(llvm::createGVNPass());
        fpm.add(llvm::createCFGSimplificationPass());
    } else {
        // copied + slightly modified from llvm/lib/Transforms/IPO/PassManagerBuilder.cpp::populateModulePassManager
        fpm.add(llvm::createEarlyCSEPass());                   // Catch trivial redundancies
        fpm.add(llvm::createJumpThreadingPass());              // Thread jumps.
        fpm.add(llvm::createCorrelatedValuePropagationPass()); // Propagate conditionals
        fpm.add(llvm::createCFGSimplificationPass());          // Merge & remove BBs
        fpm.add(llvm::createInstructionCombiningPass());       // Combine silly seq's

        fpm.add(llvm::createTailCallEliminationPass()); // Eliminate tail calls
        fpm.add(llvm::createCFGSimplificationPass());   // Merge & remove BBs
        fpm.add(llvm::createReassociatePass());         // Reassociate expressions
        fpm.add(llvm::createLoopRotatePass());          // Rotate Loop
        fpm.add(llvm::createLICMPass());                // Hoist loop invariants
        fpm.add(llvm::createLoopUnswitchPass(true /*optimize_for_size*/));
        fpm.add(llvm::createInstructionCombiningPass());
        fpm.add(llvm::createIndVarSimplifyPass()); // Canonicalize indvars
        fpm.add(llvm::createLoopIdiomPass());      // Recognize idioms like memset.
        fpm.add(llvm::createLoopDeletionPass());   // Delete dead loops

        fpm.add(llvm::createLoopUnrollPass()); // Unroll small loops

        fpm.add(llvm::createGVNPass());       // Remove redundancies
        fpm.add(llvm::createMemCpyOptPass()); // Remove memcpy / form memset
        fpm.add(llvm::createSCCPPass());      // Constant prop with SCCP

        // Run instcombine after redundancy elimination to exploit opportunities
        // opened up by them.
        fpm.add(llvm::createInstructionCombiningPass());
        fpm.add(llvm::createJumpThreadingPass()); // Thread jumps
        fpm.add(llvm::createCorrelatedValuePropagationPass());
        fpm.add(llvm::createDeadStoreEliminationPass()); // Delete dead stores

        fpm.add(llvm::createLoopRerollPass());
        // fpm.add(llvm::createSLPVectorizerPass());   // Vectorize parallel scalar chains.


        fpm.add(llvm::createAggressiveDCEPass());        // Delete dead instructions
        fpm.add(llvm::createCFGSimplificationPass());    // Merge & remove BBs
        fpm.add(llvm::createInstructionCombiningPass()); // Clean up after everything.

        // fpm.add(llvm::createBarrierNoopPass());
        // fpm.add(llvm::createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
        fpm.add(llvm::createInstructionCombiningPass());
        fpm.add(llvm::createCFGSimplificationPass());
    }

    // TODO Find the right place for this pass (and ideally not duplicate it)
    if (ENABLE_PYSTON_PASSES) {
        fpm.add(createConstClassesPass());
        fpm.add(llvm::createInstructionCombiningPass());
        fpm.add(llvm::createCFGSimplificationPass());
        fpm.add(createConstClassesPass());
        fpm.add(createDeadAllocsPass());
        // fpm.add(llvm::createSCCPPass());                  // Constant prop with SCCP
        // fpm.add(llvm::createEarlyCSEPass());              // Catch trivial redundancies
        // fpm.add(llvm::createInstructionCombiningPass());
        // fpm.add(llvm::createCFGSimplificationPass());
    }

    fpm.doInitialization();

    for (int i = 0; i < MAX_OPT_ITERATIONS; i++) {
        bool changed = fpm.run(*f);

        if (!changed) {
            if (VERBOSITY("irgen"))
                printf("done after %d optimization iterations\n", i - 1);
            break;
        }

        if (VERBOSITY("irgen") >= 1) {
            fprintf(stderr, "after optimization %d:\n", i);
            printf("\033[36m");
            fflush(stdout);
            dumpPrettyIR(f);
            // f->dump();
            // g.cur_module->dump();
            printf("\033[0m");
            fflush(stdout);
        }
    }

    long us = _t.end();
    static StatCounter us_optimizing("us_compiling_optimizing");
    us_optimizing.log(us);
}

static bool compareBlockPairs(const std::pair<CFGBlock*, CFGBlock*>& p1, const std::pair<CFGBlock*, CFGBlock*>& p2) {
    return p1.first->idx < p2.first->idx;
}

static std::vector<std::pair<CFGBlock*, CFGBlock*> >
computeBlockTraversalOrder(const BlockSet& full_blocks, const BlockSet& partial_blocks, CFGBlock* start) {

    std::vector<std::pair<CFGBlock*, CFGBlock*> > rtn;
    std::unordered_set<CFGBlock*> in_queue;

    if (start) {
        assert(full_blocks.count(start));
        in_queue.insert(start);
        rtn.push_back(std::make_pair(start, (CFGBlock*)NULL));
    }

    for (CFGBlock* b : partial_blocks) {
        in_queue.insert(b);
        rtn.push_back(std::make_pair(b, (CFGBlock*)NULL));
    }

    // It's important for debugging purposes that the order is deterministic, but the iteration
    // over the BlockSet is not:
    std::sort(rtn.begin(), rtn.end(), compareBlockPairs);

    int idx = 0;
    while (rtn.size() < full_blocks.size() + partial_blocks.size()) {
        // TODO: come up with an alternative algorithm that outputs
        // the blocks in "as close to in-order as possible".
        // Do this by iterating over all blocks and picking the smallest one
        // that has a predecessor in the list already.
        while (idx < rtn.size()) {
            CFGBlock* cur = rtn[idx].first;

            for (int i = 0; i < cur->successors.size(); i++) {
                CFGBlock* b = cur->successors[i];
                assert(full_blocks.count(b) || partial_blocks.count(b));
                if (in_queue.count(b))
                    continue;

                rtn.push_back(std::make_pair(b, cur));
                in_queue.insert(b);
            }

            idx++;
        }

        if (rtn.size() == full_blocks.size() + partial_blocks.size())
            break;

        CFGBlock* best = NULL;
        for (CFGBlock* b : full_blocks) {
            if (in_queue.count(b))
                continue;

            // Avoid picking any blocks where we can't add an epilogue to the predecessors
            if (b->predecessors.size() == 1 && b->predecessors[0]->successors.size() > 1)
                continue;

            if (best == NULL || b->idx < best->idx)
                best = b;
        }
        assert(best != NULL);

        if (VERBOSITY("irgen") >= 1)
            printf("Giving up and adding block %d to the order\n", best->idx);
        in_queue.insert(best);
        rtn.push_back(std::make_pair(best, (CFGBlock*)NULL));
    }

    ASSERT(rtn.size() == full_blocks.size() + partial_blocks.size(), "%ld\n", rtn.size());
    return rtn;
}

static void emitBBs(IRGenState* irstate, const char* bb_type, GuardList& out_guards, const GuardList& in_guards,
                    TypeAnalysis* types, const std::vector<AST_expr*>& arg_names,
                    const OSREntryDescriptor* entry_descriptor, const BlockSet& full_blocks,
                    const BlockSet& partial_blocks) {
    SourceInfo* source = irstate->getSourceInfo();
    EffortLevel::EffortLevel effort = irstate->getEffortLevel();
    CompiledFunction* cf = irstate->getCurFunction();
    ConcreteCompilerType* rtn_type = irstate->getReturnType();
    // llvm::MDNode* func_info = irstate->getFuncDbgInfo();

    if (entry_descriptor != NULL)
        assert(full_blocks.count(source->cfg->getStartingBlock()) == 0);

    // We need the entry blocks pre-allocated so that we can jump forward to them.
    std::unordered_map<CFGBlock*, llvm::BasicBlock*> llvm_entry_blocks;
    for (CFGBlock* block : source->cfg->blocks) {
        if (partial_blocks.count(block) == 0 && full_blocks.count(block) == 0) {
            llvm_entry_blocks[block] = NULL;
            continue;
        }

        char buf[40];
        snprintf(buf, 40, "%s_block%d", bb_type, block->idx);
        llvm_entry_blocks[block] = llvm::BasicBlock::Create(g.context, buf, irstate->getLLVMFunction());
    }

    llvm::BasicBlock* osr_entry_block = NULL; // the function entry block, where we add the type guards
    llvm::BasicBlock* osr_unbox_block = NULL; // the block after type guards where we up/down-convert things
    ConcreteSymbolTable* osr_syms = NULL;     // syms after conversion
    if (entry_descriptor != NULL) {
        osr_unbox_block = llvm::BasicBlock::Create(g.context, "osr_unbox", irstate->getLLVMFunction(),
                                                   &irstate->getLLVMFunction()->getEntryBlock());
        osr_entry_block = llvm::BasicBlock::Create(g.context, "osr_entry", irstate->getLLVMFunction(),
                                                   &irstate->getLLVMFunction()->getEntryBlock());
        assert(&irstate->getLLVMFunction()->getEntryBlock() == osr_entry_block);

        osr_syms = new ConcreteSymbolTable();
        SymbolTable* initial_syms = new SymbolTable();
        // llvm::BranchInst::Create(llvm_entry_blocks[entry_descriptor->backedge->target->idx], entry_block);

        llvm::BasicBlock* osr_entry_block_end = osr_entry_block;
        llvm::BasicBlock* osr_unbox_block_end = osr_unbox_block;
        std::unique_ptr<IREmitter> entry_emitter(createIREmitter(irstate, osr_entry_block_end));
        std::unique_ptr<IREmitter> unbox_emitter(createIREmitter(irstate, osr_unbox_block_end));

        CFGBlock* target_block = entry_descriptor->backedge->target;

        // Currently we AND all the type guards together and then do just a single jump;
        // guard_val is the current AND'd value, or NULL if there weren't any guards
        llvm::Value* guard_val = NULL;

        std::vector<llvm::Value*> func_args;
        for (llvm::Function::arg_iterator AI = irstate->getLLVMFunction()->arg_begin();
             AI != irstate->getLLVMFunction()->arg_end(); AI++) {
            func_args.push_back(AI);
        }

        // Handle loading symbols from the passed osr arguments:
        int arg_num = -1;
        for (const auto& p : entry_descriptor->args) {
            llvm::Value* from_arg;
            arg_num++;
            if (arg_num < 3) {
                from_arg = func_args[arg_num];
            } else {
                ASSERT(func_args.size() == 4, "%ld", func_args.size());
                llvm::Value* ptr = entry_emitter->getBuilder()->CreateConstGEP1_32(func_args[3], arg_num - 3);
                if (p.second == INT) {
                    ptr = entry_emitter->getBuilder()->CreateBitCast(ptr, g.i64->getPointerTo());
                } else if (p.second == FLOAT) {
                    ptr = entry_emitter->getBuilder()->CreateBitCast(ptr, g.double_->getPointerTo());
                }
                from_arg = entry_emitter->getBuilder()->CreateLoad(ptr);
                assert(from_arg->getType() == p.second->llvmType());
            }

            ConcreteCompilerType* phi_type;
            if (startswith(p.first, "!is_defined"))
                phi_type = BOOL;
            else
                phi_type = types->getTypeAtBlockStart(p.first, target_block);
            // ConcreteCompilerType *analyzed_type = types->getTypeAtBlockStart(p.first, block);
            // ConcreteCompilerType *phi_type = (*phis)[p.first].first;

            ConcreteCompilerVariable* var = new ConcreteCompilerVariable(p.second, from_arg, true);
            (*initial_syms)[p.first] = var;

            // It's possible to OSR into a version of the function with a higher speculation level;
            // this means that the types of the OSR variables are potentially higher (more unspecialized)
            // than what the optimized code expects.
            // So, we have to re-check the speculations and potentially deopt.
            llvm::Value* v = NULL;
            if (p.second == phi_type) {
                // good to go
                v = from_arg;
            } else if (p.second->canConvertTo(phi_type)) {
                // not sure if/when this happens, but if there's a type mismatch but one we know
                // can be handled (such as casting from a subclass to a superclass), handle it:
                ConcreteCompilerVariable* converted = var->makeConverted(*unbox_emitter, phi_type);
                v = converted->getValue();
                delete converted;
            } else {
                ASSERT(p.second == UNKNOWN, "%s", p.second->debugName().c_str());
                BoxedClass* speculated_class = NULL;
                if (phi_type == INT) {
                    speculated_class = int_cls;
                } else if (phi_type == FLOAT) {
                    speculated_class = float_cls;
                } else {
                    speculated_class = phi_type->guaranteedClass();
                }
                ASSERT(speculated_class, "%s", phi_type->debugName().c_str());

                llvm::Value* type_check = ConcreteCompilerVariable(p.second, from_arg, true)
                                              .makeClassCheck(*entry_emitter, speculated_class);
                if (guard_val) {
                    guard_val = entry_emitter->getBuilder()->CreateAnd(guard_val, type_check);
                } else {
                    guard_val = type_check;
                }
                // entry_emitter->getBuilder()->CreateCall(g.funcs.my_assert, type_check);

                if (speculated_class == int_cls) {
                    v = unbox_emitter->getBuilder()->CreateCall(g.funcs.unboxInt, from_arg);
                    (new ConcreteCompilerVariable(BOXED_INT, from_arg, true))->decvref(*unbox_emitter);
                } else if (speculated_class == float_cls) {
                    v = unbox_emitter->getBuilder()->CreateCall(g.funcs.unboxFloat, from_arg);
                    (new ConcreteCompilerVariable(BOXED_FLOAT, from_arg, true))->decvref(*unbox_emitter);
                } else {
                    assert(phi_type == typeFromClass(speculated_class));
                    v = from_arg;
                }
            }

            if (VERBOSITY("irgen"))
                v->setName("prev_" + p.first);

            (*osr_syms)[p.first] = new ConcreteCompilerVariable(phi_type, v, true);
        }

        if (guard_val) {
            // Create the guard with both branches leading to the success_bb,
            // and let the deopt path change the failure case to point to the
            // as-yet-unknown deopt block.
            // TODO Not the best approach since if we fail to do that patching,
            // the guard will just silently be ignored.
            llvm::BranchInst* br
                = entry_emitter->getBuilder()->CreateCondBr(guard_val, osr_unbox_block, osr_unbox_block);
            out_guards.registerGuardForBlockEntry(target_block, br, *initial_syms);
        } else {
            entry_emitter->getBuilder()->CreateBr(osr_unbox_block);
        }
        unbox_emitter->getBuilder()->CreateBr(llvm_entry_blocks[entry_descriptor->backedge->target]);

        for (const auto& p : *initial_syms) {
            delete p.second;
        }
        delete initial_syms;
    }

    // In a similar vein, we need to keep track of the exit blocks for each cfg block,
    // so that we can construct phi nodes later.
    // Originally I preallocated these blocks as well, but we can construct the phi's
    // after the fact, so we can just record the exit blocks as we go along.
    std::unordered_map<CFGBlock*, llvm::BasicBlock*> llvm_exit_blocks;

    ////
    // Main ir generation: go through each basic block in the CFG and emit the code

    std::unordered_map<CFGBlock*, SymbolTable*> ending_symbol_tables;
    std::unordered_map<CFGBlock*, ConcreteSymbolTable*> phi_ending_symbol_tables;
    typedef std::unordered_map<std::string, std::pair<ConcreteCompilerType*, llvm::PHINode*> > PHITable;
    std::unordered_map<CFGBlock*, PHITable*> created_phis;

    CFGBlock* initial_block = NULL;
    if (entry_descriptor) {
        initial_block = entry_descriptor->backedge->target;
    } else if (full_blocks.count(source->cfg->getStartingBlock())) {
        initial_block = source->cfg->getStartingBlock();
    }

    // The rest of this code assumes that for each non-entry block that gets evaluated,
    // at least one of its predecessors has been evaluated already (from which it will
    // get type information).
    // The cfg generation code will generate a cfg such that each block has a predecessor
    // with a lower index value, so if the entry block is 0 then we can iterate in index
    // order.
    // The entry block doesn't have to be zero, so we have to calculate an allowable order here:
    std::vector<std::pair<CFGBlock*, CFGBlock*> > traversal_order
        = computeBlockTraversalOrder(full_blocks, partial_blocks, initial_block);

    std::unordered_set<CFGBlock*> into_hax;
    for (int _i = 0; _i < traversal_order.size(); _i++) {
        CFGBlock* block = traversal_order[_i].first;
        CFGBlock* pred = traversal_order[_i].second;

        if (VERBOSITY("irgen") >= 1)
            printf("processing %s block %d\n", bb_type, block->idx);

        bool is_partial = false;
        if (partial_blocks.count(block)) {
            if (VERBOSITY("irgen") >= 1)
                printf("is partial block\n");
            is_partial = true;
        } else if (!full_blocks.count(block)) {
            if (VERBOSITY("irgen") >= 1)
                printf("Skipping this block\n");
            // created_phis[block] = NULL;
            // ending_symbol_tables[block] = NULL;
            // phi_ending_symbol_tables[block] = NULL;
            // llvm_exit_blocks[block] = NULL;
            continue;
        }

        std::unique_ptr<IRGenerator> generator(
            createIRGenerator(irstate, llvm_entry_blocks, block, types, out_guards, in_guards, is_partial));
        llvm::BasicBlock* entry_block_end = llvm_entry_blocks[block];
        std::unique_ptr<IREmitter> emitter(createIREmitter(irstate, entry_block_end));

        PHITable* phis = NULL;
        if (!is_partial) {
            phis = new PHITable();
            created_phis[block] = phis;
        }

        // Set initial symbol table:
        if (is_partial) {
            // pass
        } else if (block == source->cfg->getStartingBlock()) {
            assert(entry_descriptor == NULL);
            // number of times a function needs to be called to be reoptimized:
            static const int REOPT_THRESHOLDS[] = {
                10,    // INTERPRETED->MINIMAL
                250,   // MINIMAL->MODERATE
                10000, // MODERATE->MAXIMAL
            };

            assert(strcmp("opt", bb_type) == 0);

            if (ENABLE_REOPT && effort < EffortLevel::MAXIMAL && source->ast != NULL
                && source->ast->type != AST_TYPE::Module) {
                llvm::BasicBlock* preentry_bb
                    = llvm::BasicBlock::Create(g.context, "pre_entry", irstate->getLLVMFunction(),
                                               llvm_entry_blocks[source->cfg->getStartingBlock()]);
                llvm::BasicBlock* reopt_bb = llvm::BasicBlock::Create(g.context, "reopt", irstate->getLLVMFunction());
                emitter->getBuilder()->SetInsertPoint(preentry_bb);

                llvm::Value* call_count_ptr = embedConstantPtr(&cf->times_called, g.i64->getPointerTo());
                llvm::Value* cur_call_count = emitter->getBuilder()->CreateLoad(call_count_ptr);
                llvm::Value* new_call_count
                    = emitter->getBuilder()->CreateAdd(cur_call_count, getConstantInt(1, g.i64));
                emitter->getBuilder()->CreateStore(new_call_count, call_count_ptr);
                llvm::Value* reopt_test = emitter->getBuilder()->CreateICmpSGT(
                    new_call_count, getConstantInt(REOPT_THRESHOLDS[effort], g.i64));

                llvm::Value* md_vals[]
                    = { llvm::MDString::get(g.context, "branch_weights"), getConstantInt(1), getConstantInt(1000) };
                llvm::MDNode* branch_weights = llvm::MDNode::get(g.context, llvm::ArrayRef<llvm::Value*>(md_vals));

                llvm::BranchInst* guard = emitter->getBuilder()->CreateCondBr(
                    reopt_test, reopt_bb, llvm_entry_blocks[source->cfg->getStartingBlock()], branch_weights);

                emitter->getBuilder()->SetInsertPoint(reopt_bb);
                // emitter->getBuilder()->CreateCall(g.funcs.my_assert, getConstantInt(0, g.i1));
                llvm::Value* r = emitter->getBuilder()->CreateCall(g.funcs.reoptCompiledFunc,
                                                                   embedConstantPtr(cf, g.i8->getPointerTo()));
                assert(r);
                assert(r->getType() == g.i8->getPointerTo());

                llvm::Value* bitcast_r = emitter->getBuilder()->CreateBitCast(r, irstate->getLLVMFunction()->getType());

                std::vector<llvm::Value*> args;
                // bitcast_r->dump();
                for (llvm::Function::arg_iterator AI = irstate->getLLVMFunction()->arg_begin();
                     AI != irstate->getLLVMFunction()->arg_end(); AI++) {
                    // AI->dump();
                    args.push_back(&(*AI));
                }
                // printf("%ld\n", args.size());
                llvm::CallInst* postcall = emitter->getBuilder()->CreateCall(bitcast_r, args);
                postcall->setTailCall(true);
                if (rtn_type == VOID) {
                    emitter->getBuilder()->CreateRetVoid();
                } else {
                    emitter->getBuilder()->CreateRet(postcall);
                }

                emitter->getBuilder()->SetInsertPoint(llvm_entry_blocks[source->cfg->getStartingBlock()]);
            }
            generator->unpackArguments(arg_names, cf->sig->arg_types);
        } else if (entry_descriptor && block == entry_descriptor->backedge->target) {
            assert(block->predecessors.size() > 1);
            assert(osr_entry_block);
            assert(phis);

            for (const auto& p : entry_descriptor->args) {
                ConcreteCompilerType* analyzed_type;
                if (startswith(p.first, "!is_defined"))
                    analyzed_type = BOOL;
                else
                    analyzed_type = types->getTypeAtBlockStart(p.first, block);

                // printf("For %s, given %s, analyzed for %s\n", p.first.c_str(), p.second->debugName().c_str(),
                // analyzed_type->debugName().c_str());

                llvm::PHINode* phi = emitter->getBuilder()->CreatePHI(analyzed_type->llvmType(),
                                                                      block->predecessors.size() + 1, p.first);
                ConcreteCompilerVariable* var = new ConcreteCompilerVariable(analyzed_type, phi, true);
                generator->giveLocalSymbol(p.first, var);
                (*phis)[p.first] = std::make_pair(analyzed_type, phi);
            }
        } else if (pred == NULL) {
            assert(traversal_order.size() < source->cfg->blocks.size());
            assert(phis);
            assert(block->predecessors.size());
            for (int i = 0; i < block->predecessors.size(); i++) {
                CFGBlock* b2 = block->predecessors[i];
                assert(ending_symbol_tables.count(b2) == 0);
                into_hax.insert(b2);
            }

            const PhiAnalysis::RequiredSet& names = source->phis->getAllDefinedAt(block);
            for (const auto& s : names) {
                // TODO the list from getAllDefinedAt should come filtered:
                if (!source->liveness->isLiveAtEnd(s, block->predecessors[0]))
                    continue;

                // printf("adding guessed phi for %s\n", s.c_str());
                ConcreteCompilerType* type = types->getTypeAtBlockStart(s, block);
                llvm::PHINode* phi = emitter->getBuilder()->CreatePHI(type->llvmType(), block->predecessors.size(), s);
                ConcreteCompilerVariable* var = new ConcreteCompilerVariable(type, phi, true);
                generator->giveLocalSymbol(s, var);

                (*phis)[s] = std::make_pair(type, phi);

                if (source->phis->isPotentiallyUndefinedAfter(s, block->predecessors[0])) {
                    std::string is_defined_name = "!is_defined_" + s;
                    llvm::PHINode* phi
                        = emitter->getBuilder()->CreatePHI(g.i1, block->predecessors.size(), is_defined_name);
                    ConcreteCompilerVariable* var = new ConcreteCompilerVariable(BOOL, phi, true);
                    generator->giveLocalSymbol(is_defined_name, var);

                    (*phis)[is_defined_name] = std::make_pair(BOOL, phi);
                }
            }
        } else {
            assert(pred);
            assert(full_blocks.count(pred) || partial_blocks.count(pred));

            if (block->predecessors.size() == 1) {
                // If this block has only one predecessor, it by definition doesn't need any phi nodes.
                // Assert that the phi_st is empty, and just create the symbol table from the non-phi st:
                ASSERT(phi_ending_symbol_tables[pred]->size() == 0, "%d %d", block->idx, pred->idx);
                assert(ending_symbol_tables.count(pred));
                generator->copySymbolsFrom(ending_symbol_tables[pred]);
            } else {
                // With multiple predecessors, the symbol tables at the end of each predecessor should be *exactly* the
                // same.
                // (this should be satisfied by the post-run() code in this function)

                // With multiple predecessors, we have to combine the non-phi and phi symbol tables.
                // Start off with the non-phi ones:
                generator->copySymbolsFrom(ending_symbol_tables[pred]);

                // And go through and add phi nodes:
                ConcreteSymbolTable* pred_st = phi_ending_symbol_tables[pred];
                for (ConcreteSymbolTable::iterator it = pred_st->begin(); it != pred_st->end(); it++) {
                    // printf("adding phi for %s\n", it->first.c_str());
                    llvm::PHINode* phi = emitter->getBuilder()->CreatePHI(it->second->getType()->llvmType(),
                                                                          block->predecessors.size(), it->first);
                    // emitter->getBuilder()->CreateCall(g.funcs.dump, phi);
                    ConcreteCompilerVariable* var = new ConcreteCompilerVariable(it->second->getType(), phi, true);
                    generator->giveLocalSymbol(it->first, var);

                    (*phis)[it->first] = std::make_pair(it->second->getType(), phi);
                }
            }
        }

        generator->run(block);

        const IRGenerator::EndingState& ending_st = generator->getEndingSymbolTable();
        ending_symbol_tables[block] = ending_st.symbol_table;
        phi_ending_symbol_tables[block] = ending_st.phi_symbol_table;
        llvm_exit_blocks[block] = ending_st.ending_block;

        if (into_hax.count(block))
            ASSERT(ending_st.symbol_table->size() == 0, "%d", block->idx);
    }

    ////
    // Phi generation.
    // We don't know the exact ssa values to back-propagate to the phi nodes until we've generated
    // the relevant IR, so after we have done all of it, go back through and populate the phi nodes.
    // Also, do some checking to make sure that the phi analysis stuff worked out, and that all blocks
    // agreed on what symbols + types they should be propagating for the phis.
    for (CFGBlock* b : source->cfg->blocks) {
        PHITable* phis = created_phis[b];
        if (phis == NULL)
            continue;

        bool this_is_osr_entry = (entry_descriptor && b == entry_descriptor->backedge->target);

        const std::vector<GuardList::BlockEntryGuard*>& block_guards = in_guards.getGuardsForBlock(b);
        // printf("Found %ld guards for block %p, for %p\n", block_guards.size(), b, &in_guards);

        for (int j = 0; j < b->predecessors.size(); j++) {
            CFGBlock* b2 = b->predecessors[j];
            if (full_blocks.count(b2) == 0 && partial_blocks.count(b2) == 0)
                continue;

            // printf("%d %d %ld %ld\n", i, b2->idx, phi_ending_symbol_tables[b2]->size(), phis->size());
            compareKeyset(phi_ending_symbol_tables[b2], phis);
            assert(phi_ending_symbol_tables[b2]->size() == phis->size());
        }

        if (this_is_osr_entry) {
            compareKeyset(osr_syms, phis);
        }

        std::vector<IREmitter*> emitters;
        std::vector<llvm::BasicBlock*> offramps;
        for (int i = 0; i < block_guards.size(); i++) {
            compareKeyset(&block_guards[i]->symbol_table, phis);

            llvm::BasicBlock* off_ramp = llvm::BasicBlock::Create(g.context, "deopt_ramp", irstate->getLLVMFunction());
            offramps.push_back(off_ramp);
            llvm::BasicBlock* off_ramp_end = off_ramp;
            IREmitter* emitter = createIREmitter(irstate, off_ramp_end);
            emitters.push_back(emitter);

            block_guards[i]->branch->setSuccessor(1, off_ramp);
        }

        for (PHITable::iterator it = phis->begin(); it != phis->end(); it++) {
            llvm::PHINode* llvm_phi = it->second.second;
            for (int j = 0; j < b->predecessors.size(); j++) {
                CFGBlock* b2 = b->predecessors[j];
                if (full_blocks.count(b2) == 0 && partial_blocks.count(b2) == 0)
                    continue;

                ConcreteCompilerVariable* v = (*phi_ending_symbol_tables[b2])[it->first];
                assert(v);
                assert(v->isGrabbed());

                // Make sure they all prepared for the same type:
                ASSERT(it->second.first == v->getType(), "%d %d: %s %s %s", b->idx, b2->idx, it->first.c_str(),
                       it->second.first->debugName().c_str(), v->getType()->debugName().c_str());

                llvm_phi->addIncoming(v->getValue(), llvm_exit_blocks[b->predecessors[j]]);
            }

            if (this_is_osr_entry) {
                ConcreteCompilerVariable* v = (*osr_syms)[it->first];
                assert(v);
                assert(v->isGrabbed());

                ASSERT(it->second.first == v->getType(), "");
                llvm_phi->addIncoming(v->getValue(), osr_unbox_block);
            }

            for (int i = 0; i < block_guards.size(); i++) {
                GuardList::BlockEntryGuard* g = block_guards[i];
                IREmitter* emitter = emitters[i];

                CompilerVariable* unconverted = g->symbol_table[it->first];
                ConcreteCompilerVariable* v = unconverted->makeConverted(*emitter, it->second.first);
                assert(v);
                assert(v->isGrabbed());


                ASSERT(it->second.first == v->getType(), "");
                llvm_phi->addIncoming(v->getValue(), offramps[i]);

                // TODO not sure if this is right:
                unconverted->decvref(*emitter);
                delete v;
            }
        }

        for (int i = 0; i < block_guards.size(); i++) {
            emitters[i]->getBuilder()->CreateBr(llvm_entry_blocks[b]);
            delete emitters[i];
        }
    }

    for (CFGBlock* b : source->cfg->blocks) {
        if (ending_symbol_tables[b] == NULL)
            continue;

        for (SymbolTable::iterator it = ending_symbol_tables[b]->begin(); it != ending_symbol_tables[b]->end(); it++) {
            it->second->decvrefNodrop();
        }
        for (ConcreteSymbolTable::iterator it = phi_ending_symbol_tables[b]->begin();
             it != phi_ending_symbol_tables[b]->end(); it++) {
            it->second->decvrefNodrop();
        }
        delete phi_ending_symbol_tables[b];
        delete ending_symbol_tables[b];
        delete created_phis[b];
    }

    if (entry_descriptor) {
        for (const auto& p : *osr_syms) {
            delete p.second;
        }
        delete osr_syms;
    }
}

static void computeBlockSetClosure(BlockSet& full_blocks, BlockSet& partial_blocks) {
    if (VERBOSITY("irgen") >= 1) {
        printf("Initial full:");
        for (CFGBlock* b : full_blocks) {
            printf(" %d", b->idx);
        }
        printf("\n");
        printf("Initial partial:");
        for (CFGBlock* b : partial_blocks) {
            printf(" %d", b->idx);
        }
        printf("\n");
    }
    std::vector<CFGBlock*> q;
    BlockSet expanded;
    q.insert(q.end(), full_blocks.begin(), full_blocks.end());
    q.insert(q.end(), partial_blocks.begin(), partial_blocks.end());

    while (q.size()) {
        CFGBlock* b = q.back();
        q.pop_back();

        if (expanded.count(b))
            continue;
        expanded.insert(b);

        for (int i = 0; i < b->successors.size(); i++) {
            CFGBlock* b2 = b->successors[i];
            partial_blocks.erase(b2);
            full_blocks.insert(b2);
            q.push_back(b2);
        }
    }

    if (VERBOSITY("irgen") >= 1) {
        printf("Ending full:");
        for (CFGBlock* b : full_blocks) {
            printf(" %d", b->idx);
        }
        printf("\n");
        printf("Ending partial:");
        for (CFGBlock* b : partial_blocks) {
            printf(" %d", b->idx);
        }
        printf("\n");
    }
}
// returns a pointer to the function-info mdnode
static llvm::MDNode* setupDebugInfo(SourceInfo* source, llvm::Function* f, std::string origname) {
    int lineno = 0;
    if (source->ast)
        lineno = source->ast->lineno;

    llvm::DIBuilder builder(*g.cur_module);

    std::string fn = source->parent_module->fn;
    std::string dir = "";
    std::string producer = "pyston; git rev " STRINGIFY(GITREV);

    llvm::DIFile file = builder.createFile(fn, dir);
    llvm::DIArray param_types = builder.getOrCreateArray(llvm::None);
    llvm::DICompositeType func_type = builder.createSubroutineType(file, param_types);
    llvm::DISubprogram func_info = builder.createFunction(file, f->getName(), f->getName(), file, lineno, func_type,
                                                          false, true, lineno + 1, 0, true, f);

    // The 'variables' field gets initialized with a tag-prefixed array, but
    // a later verifier asserts that there is no tag.  Replace it with an empty array:
    func_info.getVariables()->replaceAllUsesWith(builder.getOrCreateArray(llvm::ArrayRef<llvm::Value*>()));

    llvm::DICompileUnit compile_unit
        = builder.createCompileUnit(llvm::dwarf::DW_LANG_Python, fn, dir, producer, true, "", 0);

    llvm::DIArray subprograms = builder.getOrCreateArray(&*func_info);
    compile_unit.getSubprograms()->replaceAllUsesWith(subprograms);

    compile_unit.getEnumTypes()->replaceAllUsesWith(builder.getOrCreateArray(llvm::ArrayRef<llvm::Value*>()));
    compile_unit.getRetainedTypes()->replaceAllUsesWith(builder.getOrCreateArray(llvm::ArrayRef<llvm::Value*>()));
    compile_unit.getGlobalVariables()->replaceAllUsesWith(builder.getOrCreateArray(llvm::ArrayRef<llvm::Value*>()));
    compile_unit.getImportedEntities()->replaceAllUsesWith(builder.getOrCreateArray(llvm::ArrayRef<llvm::Value*>()));
    return func_info;
}

static std::string getUniqueFunctionName(std::string nameprefix, EffortLevel::EffortLevel effort,
                                         const OSREntryDescriptor* entry) {
    static int num_functions = 0;

    std::ostringstream os;
    os << nameprefix;
    os << "_e" << effort;
    if (entry) {
        os << "_osr" << entry->backedge->target->idx << "_from_" << entry->cf->func->getName().data();
    }
    os << '_' << num_functions;
    num_functions++;
    return os.str();
}

CompiledFunction* doCompile(SourceInfo* source, const OSREntryDescriptor* entry_descriptor,
                            EffortLevel::EffortLevel effort, FunctionSignature* sig,
                            const std::vector<AST_expr*>& arg_names, std::string nameprefix) {
    Timer _t("in doCompile");

    if (VERBOSITY("irgen") >= 1)
        source->cfg->print();

    assert(g.cur_module == NULL);
    std::string name = getUniqueFunctionName(nameprefix, effort, entry_descriptor);
    g.cur_module = new llvm::Module(name, g.context);
    g.cur_module->setDataLayout(g.tm->getDataLayout()->getStringRepresentation());
    // g.engine->addModule(g.cur_module);

    ////
    // Initializing the llvm-level structures:

    int nargs = arg_names.size();
    ASSERT(nargs == sig->arg_types.size(), "%d %ld", nargs, sig->arg_types.size());

    std::vector<llvm::Type*> llvm_arg_types;
    if (entry_descriptor == NULL) {
        for (int i = 0; i < nargs; i++) {
            if (i == 3) {
                llvm_arg_types.push_back(g.llvm_value_type_ptr->getPointerTo());
                break;
            }
            llvm_arg_types.push_back(sig->arg_types[i]->llvmType());
        }
    } else {
        int arg_num = -1;
        for (const auto& p : entry_descriptor->args) {
            arg_num++;
            // printf("Loading %s: %s\n", p.first.c_str(), p.second->debugName().c_str());
            if (arg_num < 3)
                llvm_arg_types.push_back(p.second->llvmType());
            else {
                llvm_arg_types.push_back(g.llvm_value_type_ptr->getPointerTo());
                break;
            }
        }
    }

    llvm::FunctionType* ft = llvm::FunctionType::get(sig->rtn_type->llvmType(), llvm_arg_types, false /*vararg*/);

    llvm::Function* f = llvm::Function::Create(ft, llvm::Function::ExternalLinkage, name, g.cur_module);
    // g.func_registry.registerFunction(f, g.cur_module);

    CompiledFunction* cf
        = new CompiledFunction(f, sig, (effort == EffortLevel::INTERPRETED), NULL, NULL, effort, entry_descriptor);

    llvm::MDNode* dbg_funcinfo = setupDebugInfo(source, f, nameprefix);

    TypeAnalysis::SpeculationLevel speculation_level = TypeAnalysis::NONE;
    if (ENABLE_SPECULATION && effort >= EffortLevel::MODERATE)
        speculation_level = TypeAnalysis::SOME;
    TypeAnalysis* types = doTypeAnalysis(source->cfg, arg_names, sig->arg_types, speculation_level,
                                         source->scoping->getScopeInfoForNode(source->ast));

    GuardList guards;

    BlockSet full_blocks, partial_blocks;
    if (entry_descriptor == NULL) {
        for (CFGBlock* b : source->cfg->blocks) {
            full_blocks.insert(b);
        }
    } else {
        full_blocks.insert(entry_descriptor->backedge->target);
        computeBlockSetClosure(full_blocks, partial_blocks);
    }

    IRGenState irstate(cf, source, getGCBuilder(), dbg_funcinfo);

    emitBBs(&irstate, "opt", guards, GuardList(), types, arg_names, entry_descriptor, full_blocks, partial_blocks);

    // De-opt handling:

    if (!guards.isEmpty()) {
        BlockSet deopt_full_blocks, deopt_partial_blocks;
        GuardList deopt_guards;
        // typedef std::unordered_map<CFGBlock*, std::unordered_map<AST_expr*, GuardList::ExprTypeGuard*> > Worklist;
        // Worklist guard_worklist;

        guards.getBlocksWithGuards(deopt_full_blocks);
        for (const auto& p : guards.exprGuards()) {
            deopt_partial_blocks.insert(p.second->cfg_block);
        }

        computeBlockSetClosure(deopt_full_blocks, deopt_partial_blocks);

        assert(deopt_full_blocks.size() || deopt_partial_blocks.size());

        TypeAnalysis* deopt_types = doTypeAnalysis(source->cfg, arg_names, sig->arg_types, TypeAnalysis::NONE,
                                                   source->scoping->getScopeInfoForNode(source->ast));
        emitBBs(&irstate, "deopt", deopt_guards, guards, deopt_types, arg_names, NULL, deopt_full_blocks,
                deopt_partial_blocks);
        assert(deopt_guards.isEmpty());
        deopt_guards.assertGotPatched();

        delete deopt_types;
    }
    guards.assertGotPatched();

    for (const auto& p : guards.exprGuards()) {
        delete p.second;
    }

    delete types;

    if (VERBOSITY("irgen") >= 1) {
        printf("generated IR:\n");
        printf("\033[33m");
        fflush(stdout);
        dumpPrettyIR(f);
        // f->dump();
        // g.cur_module->dump();
        // g.cur_module->print(llvm::outs(), NULL);
        printf("\033[0m");
        fflush(stdout);
    } else {
        // Somehow, running this code makes it faster...?????
        // printf("\033[0m");
        // fflush(stdout);
    }

#ifndef NDEBUG
    if (!BENCH) {
        // Calling verifyFunction() confuses the profiler, which will end up attributing
        // a large amount of runtime to it since the call stack looks very similar to
        // the (expensive) case of compiling the function.
        llvm::verifyFunction(*f);
    }
#endif

    long us = _t.end();
    static StatCounter us_irgen("us_compiling_irgen");
    us_irgen.log(us);

    if (ENABLE_LLVMOPTS)
        optimizeIR(f, effort);

    bool ENABLE_IR_DEBUG = false;
    if (ENABLE_IR_DEBUG) {
        addIRDebugSymbols(f);
        // dumpPrettyIR(f);
    }

    g.cur_module = NULL;

    return cf;
}



} // namespace pyston