import os, sys, json, traceback, zipfile
import os.path as osp
import logging
import shutil

# Configure the root logger once for the whole entrypoint: INFO level,
# timestamp + level + message on every line.
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)

# Startup banner: records which interpreter runs this wrapper and with which arguments.
logging.info("-" * 20)
logging.info("Dataiku Python entrypoint starting up")
logging.info("executable = %s", sys.executable)
logging.info("argv = %s", sys.argv)
logging.info("-" * 20)

# Databricks special case.
# When the first argument is the marker "with-dku-databricks-shim", the next arguments
# (argv[2..7]) carry the run folder, the server coordinates, the API ticket and the
# execution id. They are handed to the JVM-side entrypoint, then stripped from sys.argv
# so that the rest of this wrapper sees a regular command line.
is_databricks = False
if len(sys.argv) > 1 and sys.argv[1] == "with-dku-databricks-shim":
    is_databricks= True
    dbfs_run_folder = sys.argv[2]
    server_host = sys.argv[3]
    server_port = sys.argv[4]
    server_kind = sys.argv[5]
    api_ticket = sys.argv[6]
    execution_id = sys.argv[7]

    # Number of leading argv entries (script name + shim arguments) to drop later on
    args_to_remove = 8

    logging.info("Running a PySpark Databricks activity for execution id %s in folder %s" % (execution_id, dbfs_run_folder))

    # Optional second marker: 4 more arguments (marker, PythonShell path, gateway port,
    # gateway secret) are present when running as the notebook subprocess.
    if len(sys.argv) > 8 and sys.argv[8] == "notebook-subprocess":
        # Flag set on the sys module itself; nothing in this file reads it back
        sys.i_am_the_crazy_subprocess = True
        print("I am in Databricks notebook subprocess: %s"% sys.executable)
        print("My argv: %s" % sys.argv)
        # We are in the subprocess: no Spark context exists yet, so one has to be built below
        python_shell_path = sys.argv[9]
        gw_port = int(sys.argv[10])
        gw_secret = sys.argv[11]

        # The subprocess-specific arguments must be stripped as well
        args_to_remove=12

        # Make the PythonShell module importable, then use it to attach to the
        # gateway identified by (gw_port, gw_secret)
        import sys
        sys.path.append(python_shell_path)
        import PythonShell

        import pyspark

        # NOTE(review): this prints the gateway secret in clear text to stdout — confirm this is acceptable
        print ("Establishing connection to gateway at port=%s secret=%s" % (gw_port, gw_secret))

        gateway = PythonShell.get_existing_gateway(jvm_port=gw_port,
                                       auto_convert=True,
                                       auth_token=gw_secret)
        # Build the Spark conf, context and SQL context from objects exposed by the gateway entry point.
        # `sc` and `sqlContext` are bound at module level, so they are visible to the code exec'd later.
        conf = pyspark.SparkConf(_jconf=gateway.entry_point.getSparkConf())
        sc = PythonShell.RemoteContext(gateway=gateway, conf=conf)
        sqlContext = pyspark.HiveContext(sc, gateway.entry_point.getSQLContext())

    # NOTE(review): outside of the notebook-subprocess branch, `sc` is not defined anywhere in
    # this file — presumably it is injected into the globals by the Databricks runtime; verify.
    jvm = sc._jvm
    entryPoint = jvm.__getattr__('com.dataiku.dip.spark.databricks.DKUDatabricksEntrypoint')
    # The JVM entrypoint replies with a directory, which becomes the working directory below
    tmp_dir = entryPoint.onPysparkInit(dbfs_run_folder, server_host, server_port, server_kind, api_ticket, execution_id)

    logging.info("Got entry point reply: %s" % tmp_dir)
    logging.info("Entrypoint called, rebuilding arguments")

    # Remove our magic arguments before continuing.
    # The untouched command line stays available as sys.original_wrapper_argv.
    sys.original_wrapper_argv = sys.argv
    new_args = [sys.argv[0]]
    new_args.extend(sys.argv[args_to_remove:])
    sys.argv = new_args

    #os.chdir("/dbfs/%s" % dbfs_run_folder)
    os.chdir(tmp_dir)

    logging.info("After entrypoint, argv=%s" % sys.argv)
    
# Second special case: PySpark running inside a Kubernetes pod.
# The presence of the initial fetch request file is the trigger. The files have to be
# fetched before the script starts manipulating sys.path to add packages.
if os.path.exists('/etc/initial-fetch-request/initial-fetch-request.json'):
    logging.info("Running pyspark-over-k8s in cluster mode")
    import pyspark

    # Initialize PySpark first, then reach the Java-side entrypoint through its JVM view
    pyspark.SparkContext._ensure_initialized()
    jvm = pyspark.SparkContext._jvm
    entryPoint = jvm.__getattr__('com.dataiku.dip.spark.submit.DKUSparkKubernetesEntrypoint')
    entryPoint.doFetchFromPySpark()

    # Register the distributed Python packages right away, so that they can be
    # imported even before the SparkContext starts
    with open('/etc/initial-fetch-request/initial-fetch-request.json', 'r') as f:
        fr = json.load(f)
    logging.info("fetch request is %s" % fr)
    sys.path.extend(fr.get("pathsToPyDistribute", []))

# Locate and load the "remote-run-env-def.json" descriptor, if any.
# The current directory is probed first, then every sys.path entry; the first
# regular file found wins. dku_exec_env stays None when nothing is found.
dku_exec_env = None
for p in ['.'] + sys.path:
    candidate = os.path.join(p, "remote-run-env-def.json")
    logging.info("Looking for RemoteRunEnvDef in %s", candidate)
    if not os.path.isfile(candidate):
        continue
    logging.info("Found RemoteRunEnvDef environment: %s ", candidate)
    with open(candidate, "r") as f:
        dku_exec_env = json.load(f)
    break

# Folders registered in this list are removed when the job is over
folders_to_delete_after_job = []

def safe_os_environ_update(added):
    """Copy every key/value pair of ``added`` into ``os.environ``.

    On Python 3 this is a plain ``os.environ.update``. On Python 2, putting
    ``unicode`` objects in ``os.environ`` fails, so unicode keys and values
    are encoded to UTF-8 byte strings first.

    :param added: mapping of environment variable names to values
    """
    if sys.version_info > (3,0):
        os.environ.update(added)
        return

    # Python 2 only from here on (the `unicode` builtin does not exist on Python 3)
    def _encode_if_unicode(item):
        if item is not None and isinstance(item, unicode):
            return item.encode('utf8')
        return item

    for name in added:
        # Look the value up with the original key, before the key gets encoded
        value = _encode_if_unicode(added[name])
        os.environ[_encode_if_unicode(name)] = value

# Prepare the execution environment from the RemoteRunEnvDef (when one was found).
# Names set here and read further down: runs_remotely, script_file, job_cwd and,
# in the remote case only, job_id.
#  - remote run: the script and the zipped Python packages have been shipped next to
#    this wrapper; the packages are unzipped and added to sys.path.
#  - local run: only the environment variables and the Python path chunks are applied.
if dku_exec_env is not None and dku_exec_env['runsRemotely']:
    logging.info("Running a DSS Python recipe remotely")

    # running in yarn-cluster mode
    runs_remotely = True

    # file has been distributed, so it's in the local dir now
    script_file = os.path.basename(sys.argv[1])

    logging.info("Will execute script file: %s" % script_file)

    # setup env according to what we got
    job_id = dku_exec_env['jobId']
    safe_os_environ_update(dku_exec_env['env'])

    # Unzip packages that we received by zips
    unzip_target_path = "."
    if is_databricks:
        # For Databricks, the "." is the run path, which is on DBFS,
        # so far too slow to work directly off it.
        # Instead, we unzip into a temporary folder, registered for deletion after the job
        import tempfile
        unzip_target_path = tempfile.mkdtemp()
        folders_to_delete_after_job.append(unzip_target_path)

    logging.info("Unzipping Python libraries and packages")

    # Packages other than the two special ones below are unzipped under this
    # folder, which is itself put on sys.path
    unzipped_packages_path = os.path.join(unzip_target_path, '__dku_pyFiles')
    sys.path.append(unzipped_packages_path)

    for package in dku_exec_env['python']['packages']:
        package_name = package['key']
        package_folder = package['value']
        logging.info("Unzipping package: %s mapped to %s" % (package_name, package_folder))
        if package_name == 'instance-python-lib':
            local_package_folder = os.path.join(unzip_target_path, '__dku_pyLib')
            sys.path.append(local_package_folder)
        elif package_name == 'project-libs':
            # Not added to sys.path as a whole: only its declared sub-paths are (see below)
            local_package_folder = os.path.join(unzip_target_path, '__dku_projectLibs')
        else:
            local_package_folder = os.path.join(unzipped_packages_path, package_name)
        zip_name = dku_exec_env['zippedFolders'][package_folder]
        logging.info("Unzipping %s to %s" % (zip_name, local_package_folder))
        # Use the archive as a context manager so that its file handle is closed
        # as soon as the extraction is done (one archive is opened per package)
        with zipfile.ZipFile(zip_name) as z:
            z.extractall(path=local_package_folder)

        if package_name == "project-libs":
            # The archive ships a manifest listing which of its sub-folders go on the Python path
            print("Post-processing project-libs")
            with open(osp.join(local_package_folder, "project-lib-paths.json")) as f:
                project_libs_paths = json.load(f)
            for lib_subpath in project_libs_paths["pythonPath"]:
                full_path = osp.join(local_package_folder, lib_subpath)
                print("Adding %s to Pythonpath" % full_path)
                sys.path.append(full_path)

    logging.info("Done unzipping Python libraries and packages")

    # No job cwd in the remote case: the error report is not written to the local filesystem
    job_cwd = None
else:
    logging.info("Running a DSS Python recipe locally, uinsetting env")

    runs_remotely = False
    script_file = sys.argv[1]
    if dku_exec_env is not None:
        # A descriptor exists but does not request a remote run: apply its
        # environment variables and extra Python path entries
        job_cwd = dku_exec_env.get('cwd', None)
        safe_os_environ_update(dku_exec_env['env'])
        for p in dku_exec_env.get("python", {}).get("pythonPathChunks", []):
            sys.path.append(p)
    else:
        job_cwd = None

# Trace the final state of the setup right before the recipe code gets loaded
logging.info("Setup complete, ready to execute Python code")
logging.info("Sys path: %s", sys.path)
logging.info("Script file: %s", script_file)

import dataiku
import dataiku.core
import dataiku.core.flow


# Execute the recipe script, report any failure as an error.json document, and clean up.
#
# Note: this does not use the regular ErrorMonitoringWrapper because
# it does much more stuff, such as sending the error by HTTP in the
# Livy / Databricks cases
#
# On failure the process exits with status 1, after error.json has been either sent
# over HTTP (remote runs) or written to the local filesystem (local runs).
try:
    # The code is exec'd from a plain string: its frames therefore carry the
    # "<string>" filename, which the traceback walk below relies on.
    with open(script_file) as f:
        exec(f.read())
except Exception:
    (exc_type, exc_value, tb) = sys.exc_info()
    sys.stderr.write("*************** Recipe code failed **************\n")
    sys.stderr.write("Begin Python stack\n") # Smart log marker
    traceback.print_exc()
    sys.stderr.write("End Python stack\n") # Smart log marker

    from dataiku.base.utils import safe_unicode_str

    additional_prefix = ""

    # In Databricks, there is an additional <string> in <module> because of their REPL. We skip it
    # to find the real error line
    string_modules_to_skip = 1 if is_databricks else 0
    skipped_string_modules = 0
    # Walk the traceback from the outermost frame down, looking for the module-level
    # frame of the exec'd script to get the failing line number
    while tb is not None:
        if tb.tb_frame is not None and tb.tb_frame.f_code is not None:
            if tb.tb_frame.f_code.co_filename == "<string>" and tb.tb_frame.f_code.co_name == "<module>":
                if skipped_string_modules < string_modules_to_skip:
                    skipped_string_modules += 1
                else:
                    additional_prefix = "At line %s: " % tb.tb_lineno
                    break
        tb = tb.tb_next

    err = {
        "detailedMessage" : "%s%s: %s" % (additional_prefix, safe_unicode_str(exc_type), safe_unicode_str(exc_value)),
        "errorType" : safe_unicode_str(exc_type),
        "message" : safe_unicode_str(exc_value)
    }

    if runs_remotely:
        # send over http
        from dataiku.core import flow
        from dataiku.core.intercom import jek_or_backend_json_call,jek_or_backend_void_call
        from dataiku.core import dkuio

        # The full stack is attached as well in the remote case
        err["additionalInformationToLog"] = safe_unicode_str(traceback.format_exc())

        print("Runs remotely, got flow=%s" % (dir(flow),))

        logging.info("Sending error.json to JEK or Backend")

        flow.load_flow_spec()
        f = dkuio.new_bytesoriented_io(json.dumps(err))
        jek_or_backend_void_call('containers/put-file-multipart',
                                 params={'executionId': job_id, 'fileKind': 'FILTERED_PATHS', 'path': "error.json", 'expand': 'false'},
                                 files={'file':("error.json", f, 'text/json')})
        print("Sent error.json : %s" % err)
    else:
        # simply write to local filesystem, in the job cwd when there is one
        error_json_path = os.path.join(job_cwd, "error.json") if job_cwd is not None else "error.json"
        with open(error_json_path, "w") as f:
            f.write(json.dumps(err))
    sys.exit(1)
finally:
    try:
        # if we are a spark thing, make sure we tear down the SparkContext on our way out.
        # needed for spark-over-k8s, unless you want to leak pods
        if not is_databricks: # the Databricks branch did not create this context through SparkContext
            try:
                logging.info("Check if spark is available")
                from pyspark.context import SparkContext
                logging.info("Check if a spark context is active")
                sc = SparkContext._active_spark_context
                if sc is not None:
                    logging.info("Stopping the spark context")
                    sc.stop()
            except Exception as e:
                # Best effort: pyspark may simply not be installed
                logging.info("Not stopping a spark context: %s" % str(e))
    finally:
        # Best-effort removal of the temporary folders: a folder that cannot be deleted
        # must neither prevent the deletion of the next ones, nor replace the exit
        # status of the script with a cleanup error.
        for folder in folders_to_delete_after_job:
            try:
                shutil.rmtree(folder)
            except Exception as e:
                logging.warning("Could not delete temporary folder %s: %s" % (folder, str(e)))
