LSTM: adding the encoder's hidden states to the decoder

I am trying to experiment with transferring the hidden states of an LSTM from an encoder layer to a decoder layer, as demonstrated in the Keras blog.

My data is randomly generated sine-waves (that is, wavelength and phase are determined randomly, as well as the length of the sequence), and the network is trained to receive a number of sine-waves and predict their progression.

Without transferring the hidden states, my code is as follows:

from keras.models import Model
from keras.layers import Input, LSTM, Dense, TimeDistributed,Lambda, Dropout, Activation ,RepeatVector
from keras.callbacks import ModelCheckpoint 
import numpy as np

# Number of sine-wave channels per time step; also the width of the output layer.
features_num = 5

# Encoder: consumes a variable-length sequence (time dimension is None) and,
# with return_sequences left at its default, emits one 40-unit vector.
encoder_inputs = Input(shape=(None, features_num))
encoder = LSTM(40, return_state=False)
encoder_outputs = encoder(encoder_inputs)

# Feed the encoder vector to the decoder once per predicted time step (150).
decoder_input = RepeatVector(150)(encoder_outputs)

# Decoder: return_state=True makes the call return (sequence, h, c); only the
# sequence is used here, and it starts from the default initial state.
decoder_lstm = LSTM(40, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_input)
# Project every decoder step back to features_num values.
decoder_outputs = TimeDistributed(Dense(features_num))(decoder_outputs)

model = Model(encoder_inputs, decoder_outputs)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

model.compile(loss='mean_squared_error', optimizer='adam')

def create_wavelength(min_wavelength, max_wavelength, fluxes_in_wavelength, category):
    """Draw a random wavelength whose sub-grid offset identifies the data split.

    The value is built from two uniform draws:

    * a grid point ``(trunc(k * u * (max - min)) + k * min) / k`` with
      ``k = fluxes_in_wavelength``, i.e. a multiple of ``1 / k``;
    * an offset ``c / k`` with ``c = (category + u') / 6``, so ``c`` lies in
      ``[category / 6, (category + 1) / 6)``.

    Categories used by the callers: 0 - train, 2 - validate, 4 - test.
    Categories 1, 3 and 5 are dead space separating those bands.

    Args:
        min_wavelength: lower bound used for the grid point.
        max_wavelength: upper bound used for the grid point.
        fluxes_in_wavelength: number of grid steps per unit of wavelength.
        category: integer band selector (see above).

    Returns:
        The grid point plus the category-specific offset.
    """
    steps = fluxes_in_wavelength
    # The offset draw comes first; keeping this order keeps seeded runs
    # reproducible.
    offset = (category + np.random.random()) / 6
    span = max_wavelength - min_wavelength
    grid_index = np.trunc(steps * np.random.random() * span) + steps * min_wavelength
    return grid_index / steps + offset / steps

def make_line(length, category):
    """Return ``length`` samples of a sine wave with random phase and wavelength.

    The samples are ``sin(t / wavelength + shift)`` for ``t = 0 .. length - 1``.
    Because ``t`` is divided by the wavelength directly, one full period
    spans ``2 * pi * wavelength`` samples.

    Args:
        length: number of samples to generate.
        category: data-split selector forwarded to ``create_wavelength``.

    Returns:
        1-D numpy array of ``length`` sine values.
    """
    # Phase offset in radians, uniform in [0, 1).
    shift = np.random.random()
    # Bounds go in (min, max) order. They were previously passed as (30, 10),
    # which made the span negative inside create_wavelength.
    wavelength = create_wavelength(10, 30, 1, category)
    a = np.arange(length)
    return np.sin(a / wavelength + shift)




def make_data(seq_num, seq_len, dim, category):
    """Generate a batch of independent random sine-wave sequences.

    Args:
        seq_num: number of sequences in the batch.
        seq_len: number of time steps per sequence.
        dim: number of channels per time step; each channel is its own
            ``make_line`` draw.
        category: data-split selector forwarded to ``make_line``.

    Returns:
        Float array of shape ``(seq_num, seq_len, dim)``.
    """
    # Allocate once and fill in place. Growing the array with np.append
    # re-copies everything accumulated so far on every iteration.
    data = np.empty((seq_num, seq_len, dim))
    for i in range(seq_num):
        # Sequence-major, channel-minor order keeps the sequence of random
        # draws the same as building the batch one line at a time.
        for j in range(dim):
            data[i, :, j] = make_line(seq_len, category)
    return data



def train_generator():
    """Endlessly yield ``(x_train, y_train)`` batches of fresh training data.

    Every batch holds 1000 sequences from category 0 (train). The total
    sequence length is redrawn per batch; the last 150 steps are the target
    and everything before them is the input.
    """
    horizon = 150  # number of trailing steps the model must predict
    while True:
        total_len = np.random.randint(150, 300) + 150
        batch = make_data(1000, total_len, features_num, 0)
        yield batch[:, :-horizon, :], batch[:, -horizon:, :]
def val_generator():
    """Endlessly yield ``(x_val, y_val)`` batches of fresh validation data.

    Every batch holds 1000 sequences from category 2 (validate). The total
    sequence length is redrawn per batch; the last 150 steps are the target
    and everything before them is the input.
    """
    horizon = 150  # number of trailing steps the model must predict
    while True:
        total_len = np.random.randint(150, 300) + 150
        batch = make_data(1000, total_len, features_num, 2)
        yield batch[:, :-horizon, :], batch[:, -horizon:, :]
def test_maker():
    """Build one ``(x_test, y_test)`` batch of test data.

    The batch holds 1000 sequences from category 4 (test) with a randomly
    drawn total length. The last 150 steps are the target and everything
    before them is the input.
    """
    horizon = 150  # number of trailing steps the model must predict
    total_len = np.random.randint(150, 300) + 150
    batch = make_data(1000, total_len, features_num, 4)
    inputs = batch[:, :-horizon, :]
    targets = batch[:, -horizon:, :]
    return inputs, targets


filepath_for_w = 'flux_vi_model.h5'
# Write a checkpoint to filepath_for_w only when val_loss improves.
checkpointer = ModelCheckpoint(filepath_for_w, monitor='val_loss', verbose=0, save_best_only=True, mode='auto', period=1)
model.fit_generator(train_generator(), callbacks=[checkpointer], steps_per_epoch=30, epochs=1000, verbose=1, validation_data=val_generator(), validation_steps=30)
# Save the final-epoch model under its own name. The previous call passed
# the literal string 'filepath_for_w' instead of a real path. Reusing
# filepath_for_w here would overwrite the best-val_loss checkpoint above.
final_model_path = 'flux_vi_model_final.h5'
model.save(final_model_path)

x, y = test_maker()

# Predict the last 150 steps for the test inputs, using the in-memory
# (final-epoch) model.
a = model.predict(x)
np.save('a.npy', a)
np.save('y.npy', y)
np.save('x.npy', x)
# Mean absolute error between the true and predicted continuations.
print(np.mean(np.absolute(y - a)))

The printed result is the mean absolute error between the actual last 150 points of each sine wave and the values predicted by the model.

For this code, I received a result of 0.065.

When I tried to make use of the hidden states of the LSTM, my results, to my surprise, have worsened. I use the same code, replacing the model with:

# Encoder: return_state=True makes the call return the output together with
# the final hidden state (h) and cell state (c).
encoder_inputs = Input(shape=(None, features_num))
encoder = LSTM(40, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# NOTE(review): per the Keras LSTM docs, with return_sequences=False the first
# return value is the last hidden state, so encoder_outputs and state_h should
# carry the same values. The decoder then receives that vector twice: as its
# repeated input and as its initial h. Confirm against the Keras version used.
repeat_to_horizon = RepeatVector(150)
decoder_input = repeat_to_horizon(encoder_outputs)

# Decoder starts from the encoder's final (h, c) instead of the default state.
# Only the output sequence (element 0 of the returned values) is kept.
decoder_lstm = LSTM(40, return_sequences=True, return_state=True)
decoder_outputs = decoder_lstm(decoder_input, initial_state=encoder_states)[0]
step_projection = TimeDistributed(Dense(features_num))
decoder_outputs = step_projection(decoder_outputs)

The result was 0.101, indicating a reduced ability to predict the continuation of the sine wave when the decoder has access to the hidden states of the encoder.

Is my approach wrong in this case and the hidden states cannot be used to improve prediction? Or did I construct the model incorrectly?