Different learning curve of Adam between TensorFlow and Keras

Posted 2019-07-04 20:40

Question:

I'm currently porting my code from Keras to TensorFlow in order to use the new quantized-training feature in TensorFlow 1.10.0. However, I found that training behaves very differently in Keras and in TensorFlow when using the Adam optimizer.
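In both frameworks I use Adam with its default settings. To rule out a difference in the defaults themselves (the epsilon value in particular may not be identical in the two libraries), the optimizers could also be constructed with explicit hyperparameters; a minimal sketch of what I mean, not what the script below actually does:

from keras import optimizers
import tensorflow as tf

# Keras side: pass an optimizer instance instead of the 'adam' string
keras_adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
# model.compile(keras_adam, 'mean_squared_error')

# TensorFlow side: spell out the same hyperparameters
tf_adam = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8)
# training_steps = tf_adam.minimize(mse)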

Here is a minimal script that trains the same target, sin(10x), once the Keras way and once the TensorFlow way.

from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K

KERAS = 'keras'
TENSORFLOW = 'tensorflow'


def create_model():
    ipt = Input([1])
    m = Dense(1000, activation='relu')(ipt)
    m = BatchNormalization()(m)
    m = Dense(1000, activation='relu')(m)
    m = BatchNormalization()(m)
    m = Dense(1)(m)
    return Model(ipt, m)


valX = np.expand_dims(np.linspace(-1, 1, 10000), 1)
valY = np.sin(valX * 10)
valY_ = {}

for phase in (KERAS, TENSORFLOW):
    sess = tf.Session()   # a fresh session for each phase
    K.set_session(sess)   # register it as the Keras backend session

    model = create_model()

    if phase == KERAS:
        model.compile('adam', 'mean_squared_error')
    else:
        # build the loss and training op manually on top of the Keras graph
        tensor_y_gt = tf.placeholder(dtype=tf.float32, shape=model.output.get_shape().as_list())
        mse = tf.losses.mean_squared_error(tensor_y_gt, model.output)  # (labels, predictions)
        training_steps = tf.train.AdamOptimizer().minimize(mse)
        sess.run(tf.global_variables_initializer())

    for step in range(2000):
        X = np.random.uniform(-1, 1, [256, 1])
        Y = np.sin(X * 10)

        if phase == KERAS:
            loss = model.train_on_batch(X, Y)
        else:
            loss, _ = sess.run([mse, training_steps], feed_dict={model.input: X, tensor_y_gt: Y})

        if step % 100 == 0:
            print('%s, step#%d, loss=%.5f' % (phase, step, loss))

    valY_[phase] = model.predict(valX)[:, 0]

    sess.close()

valX = valX[:, 0]
valY = valY[:, 0]

plt.plot(valX, valY, 'r--', label='sin(10x)')
plt.plot(valX, valY_[KERAS], 'g-', label=KERAS)
plt.plot(valX, valY_[TENSORFLOW], 'b-', label=TENSORFLOW)
plt.legend(loc='best', ncol=1)
plt.show()

You can see the difference between the two in the resulting figure: plot of sin(10x) together with the Keras and TensorFlow predictions.
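One thing that might matter is the BatchNormalization layers: Keras' train_on_batch runs the layers' update ops and sets the learning phase for me, while the raw-TensorFlow branch above does neither. A rough, untested sketch of how that could be wired in by hand, assuming model.updates and K.learning_phase() behave as documented:

# run the BN moving-average updates together with the optimizer step
with tf.control_dependencies(model.updates):
    training_steps = tf.train.AdamOptimizer().minimize(mse)

# feed the Keras learning phase so BN uses batch statistics during training
loss, _ = sess.run([mse, training_steps],
                   feed_dict={model.input: X,
                              tensor_y_gt: Y,
                              K.learning_phase(): 1})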

Environment:

  • tensorflow-gpu 1.10.0
  • Keras 2.2.2

Does anyone have a clue?