置顶 综合  分享代码“猜你喜欢”7.83398分 

yinjh 发表于 Jul 15, 2016 9:24:46 AM

 2922  32  15
# -*- coding: utf-8 -*-

import numpy as np
np.random.seed(2016)

import os
import glob
import math
import pickle
import datetime

from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge
from keras.models import Model

def load_train(path=None):
    """Load the training triples (user id, item id, score) from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line must contain at least three comma-separated
    integer fields; only the first three are read and any further columns
    are ignored.

    Args:
        path: Location of the CSV file. Defaults to ``./data/train.csv``.

    Returns:
        A tuple ``(X_train_uid, X_train_iid, Y_train_score)`` of three
        equally long lists of ints, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If one of the first three fields is not an integer.
        IndexError: If a non-blank line has fewer than three fields.
    """
    if path is None:
        path = os.path.join('./data', 'train.csv')
    # Pass one pre-formatted string so Python 2 does not print a tuple.
    print('Read train data %s' % path)

    X_train_uid = []
    X_train_iid = []
    Y_train_score = []

    # The context manager closes the file even if a row fails to parse.
    with open(path, 'r') as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of crashing on int('').
                continue
            arr = line.split(',')
            X_train_uid.append(int(arr[0]))
            X_train_iid.append(int(arr[1]))
            Y_train_score.append(int(arr[2]))
    return X_train_uid, X_train_iid, Y_train_score

def load_test(path=None):
    """Load the test pairs (user id, item id) from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line must contain at least two comma-separated integer
    fields; only the first two are read and any further columns are
    ignored.

    Args:
        path: Location of the CSV file. Defaults to ``./data/test.csv``.

    Returns:
        A tuple ``(X_test_uid, X_test_iid)`` of two equally long lists of
        ints, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If one of the first two fields is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    if path is None:
        path = os.path.join('./data', 'test.csv')
    # Pass one pre-formatted string so Python 2 does not print a tuple.
    print('Read test data %s' % path)

    X_test_uid = []
    X_test_iid = []

    # The context manager closes the file even if a row fails to parse.
    with open(path, 'r') as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of crashing on int('').
                continue
            arr = line.split(',')
            X_test_uid.append(int(arr[0]))
            X_test_iid.append(int(arr[1]))
    return X_test_uid, X_test_iid


# ---- load raw data ----
X_train_uid, X_train_iid, Y_train_score = load_train()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("load train data OK.")

X_test_uid, X_test_iid = load_test()
print("load test data OK.")

# ---- normalize train data ----
# Turn each id list into a column of shape (n_samples, 1) so that every
# sample matches the Input(shape=(1,)) layers defined below.
X_train_uid = np.array(X_train_uid).reshape(-1, 1)
X_train_iid = np.array(X_train_iid).reshape(-1, 1)

# Linear rescale: a score of 1 maps to 0.0 and a score of 5 maps to 1.0,
# which fits the range of the sigmoid output unit.
# NOTE(review): presumably scores are integers in 1..5 -- confirm against data.
Y_train_score = np.array(Y_train_score).astype('float32')
Y_train_score = (Y_train_score - 1) / 4

# ---- normalize test data ----
X_test_uid = np.array(X_test_uid).reshape(-1, 1)
X_test_iid = np.array(X_test_iid).reshape(-1, 1)

# ---- define model ----
# Embedding table sizes; every id fed to the model must be smaller than the
# corresponding value.
# NOTE(review): hard-coded, presumably max id + 1 of the competition data --
# confirm before reusing on another data set.
N_USERS = 223970
N_ITEMS = 14726
EMBED_DIM = 128

input_1 = Input(shape=(1,), dtype='int32')  # user id
input_2 = Input(shape=(1,), dtype='int32')  # item id
x1 = Embedding(output_dim=EMBED_DIM, input_dim=N_USERS, input_length=1)(input_1)
x2 = Embedding(output_dim=EMBED_DIM, input_dim=N_ITEMS, input_length=1)(input_2)
x1 = Flatten()(x1)
x2 = Flatten()(x2)
# Concatenate the user and item vectors, then pass them through two dense
# layers with dropout in between.
x = merge([x1, x2], mode='concat')
x = Dropout(0.2)(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
out = Dense(1, activation='sigmoid')(x)
model = Model(input=[input_1, input_2], output=out)
model.compile(optimizer='rmsprop',
              loss='mean_squared_error',
              metrics=[])

# ---- train model ----
model.fit([X_train_uid, X_train_iid], Y_train_score,
          nb_epoch=10, batch_size=1024*6)

# ---- predict ----
Y_test_score = model.predict([X_test_uid, X_test_iid], batch_size=2048)
# Undo the (score - 1) / 4 rescaling applied to the training targets.
Y_test_score = Y_test_score * 4 + 1

# One prediction per line under a "score" header; the context manager
# guarantees the file is flushed and closed even if a write fails.
with open("out.csv", "w") as f:
    f.write("score\n")
    for score in Y_test_score[:, 0]:
        f.write("{:1.4f}\n".format(score))


32 回复

Thinker

DC币 1

Jul 15, 2016 9:28:57 AM

good job  !

1

yinjh

DC币 16

Jul 15, 2016 9:35:46 AM

 是我几天前的版本

现在可以在排行榜排第5名

欢迎任何人随意使用

通过改进和整合来进行提高

我现在第二,估计做不上去了

所以开源,看能帮别人不


2

yinjh

DC币 16

Jul 15, 2016 9:36:16 AM
采用了词嵌入和深度学习技术
3

兵荒马乱

DC币 0

Jul 15, 2016 9:48:05 AM
@yinjh  你这个代码我给你满分,不怕你骄傲
4

兵荒马乱

DC币 0

Jul 15, 2016 9:49:04 AM
在这个页面给个github的链接就好了
5

gaodahong

DC币 1

Jul 15, 2016 9:55:01 AM

大写的赞!

6

yuanyuan

DC币 0

Jul 15, 2016 10:01:15 AM

这个大爱啊,赶紧拿去实验一下,都不要阻止我怒上排行榜

7

story

DC币 0

Jul 15, 2016 10:08:25 AM

也看过大神之前的一次分享,小白表示崇拜,要是能讲解一下思路就好了

8

thinkers

DC币 0

Jul 15, 2016 10:15:34 AM

我认识目前排在第一的@ yes,boy!  ,但是你这个也相当不错了,赞!!

9

圆苹果

DC币 0

Jul 15, 2016 10:29:20 AM

意思是说,我直接用你这个代码,就能直接升到第5咯,哈哈哈~

10

yinjh

DC币 16

Jul 15, 2016 10:38:13 AM
# -*- coding: utf-8 -*-

import numpy as np
np.random.seed(2016)

import os
import glob
import math
import pickle
import datetime

from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge
from keras.models import Model

def load_train(path=None):
    """Load the training triples (user id, item id, score) from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line must contain at least three comma-separated
    integer fields; only the first three are read and any further columns
    are ignored.

    Args:
        path: Location of the CSV file. Defaults to ``./data/train.csv``.

    Returns:
        A tuple ``(X_train_uid, X_train_iid, Y_train_score)`` of three
        equally long lists of ints, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If one of the first three fields is not an integer.
        IndexError: If a non-blank line has fewer than three fields.
    """
    if path is None:
        path = os.path.join('./data', 'train.csv')
    # Pass one pre-formatted string so Python 2 does not print a tuple.
    print('Read train data %s' % path)

    X_train_uid = []
    X_train_iid = []
    Y_train_score = []

    # The context manager closes the file even if a row fails to parse.
    with open(path, 'r') as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of crashing on int('').
                continue
            arr = line.split(',')
            X_train_uid.append(int(arr[0]))
            X_train_iid.append(int(arr[1]))
            Y_train_score.append(int(arr[2]))
    return X_train_uid, X_train_iid, Y_train_score

def load_test(path=None):
    """Load the test pairs (user id, item id) from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line must contain at least two comma-separated integer
    fields; only the first two are read and any further columns are
    ignored.

    Args:
        path: Location of the CSV file. Defaults to ``./data/test.csv``.

    Returns:
        A tuple ``(X_test_uid, X_test_iid)`` of two equally long lists of
        ints, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If one of the first two fields is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    if path is None:
        path = os.path.join('./data', 'test.csv')
    # Pass one pre-formatted string so Python 2 does not print a tuple.
    print('Read test data %s' % path)

    X_test_uid = []
    X_test_iid = []

    # The context manager closes the file even if a row fails to parse.
    with open(path, 'r') as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of crashing on int('').
                continue
            arr = line.split(',')
            X_test_uid.append(int(arr[0]))
            X_test_iid.append(int(arr[1]))
    return X_test_uid, X_test_iid


# ---- load raw data ----
X_train_uid, X_train_iid, Y_train_score = load_train()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("load train data OK.")

X_test_uid, X_test_iid = load_test()
print("load test data OK.")

# ---- normalize train data ----
# Turn each id list into a column of shape (n_samples, 1) so that every
# sample matches the Input(shape=(1,)) layers defined below.
X_train_uid = np.array(X_train_uid).reshape(-1, 1)
X_train_iid = np.array(X_train_iid).reshape(-1, 1)

# Linear rescale: a score of 1 maps to 0.0 and a score of 5 maps to 1.0,
# which fits the range of the sigmoid output unit.
# NOTE(review): presumably scores are integers in 1..5 -- confirm against data.
Y_train_score = np.array(Y_train_score).astype('float32')
Y_train_score = (Y_train_score - 1) / 4

# ---- normalize test data ----
X_test_uid = np.array(X_test_uid).reshape(-1, 1)
X_test_iid = np.array(X_test_iid).reshape(-1, 1)

# ---- define model ----
# Embedding table sizes; every id fed to the model must be smaller than the
# corresponding value.
# NOTE(review): hard-coded, presumably max id + 1 of the competition data --
# confirm before reusing on another data set.
N_USERS = 223970
N_ITEMS = 14726
EMBED_DIM = 128

input_1 = Input(shape=(1,), dtype='int32')  # user id
input_2 = Input(shape=(1,), dtype='int32')  # item id
x1 = Embedding(output_dim=EMBED_DIM, input_dim=N_USERS, input_length=1)(input_1)
x2 = Embedding(output_dim=EMBED_DIM, input_dim=N_ITEMS, input_length=1)(input_2)
x1 = Flatten()(x1)
x2 = Flatten()(x2)
# Concatenate the user and item vectors, then pass them through two dense
# layers with dropout in between.
x = merge([x1, x2], mode='concat')
x = Dropout(0.2)(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
out = Dense(1, activation='sigmoid')(x)
model = Model(input=[input_1, input_2], output=out)
model.compile(optimizer='rmsprop',
              loss='mean_squared_error',
              metrics=[])

# ---- train model ----
model.fit([X_train_uid, X_train_iid], Y_train_score,
          nb_epoch=10, batch_size=1024*6)

# ---- predict ----
Y_test_score = model.predict([X_test_uid, X_test_iid], batch_size=2048)
# Undo the (score - 1) / 4 rescaling applied to the training targets.
Y_test_score = Y_test_score * 4 + 1

# One prediction per line under a "score" header; the context manager
# guarantees the file is flushed and closed even if a write fails.
with open("out.csv", "w") as f:
    f.write("score\n")
    for score in Y_test_score[:, 0]:
        f.write("{:1.4f}\n".format(score))


11

yinjh

DC币 16

Jul 15, 2016 10:42:21 AM

楼上这个代码格式对了

12

yinjh

DC币 16

Jul 15, 2016 10:44:28 AM
思路很简单

程序也很简洁

把用户和商品都嵌入到128的空间,再过几层神经元网络,用打分数据训练

词向量的思路

13

yinjh

DC币 16

Jul 15, 2016 10:47:06 AM

可改进的空间还很大,目前我第二名的程序和这个差不多的

可以做各种结构和超参数的优化

我希望各位抓紧时间,超越"yes boy"

各位也可以用这个模型的结果和自己的结果做融合

14

yinjh

DC币 16

Jul 15, 2016 10:48:50 AM

运行速度也很快的,如果你有GPU

15

圆苹果

DC币 0

Jul 15, 2016 10:55:05 AM

i5  5代的集显跑得快吗

16

yinjh

DC币 16

Jul 15, 2016 10:57:05 AM

开源是无条件的,但我希望:

1 对这个程序的改进,也可以贴在这里开源(不是必须的)

2 有人能通过对这个程序的改进,得到更好的名次时,告诉我一下。(不是必须的)


17

yinjh

DC币 16

Jul 15, 2016 11:02:04 AM
我用了5个小时

从0开始,做完这个程序

18

yinjh

DC币 16

Jul 15, 2016 11:09:22 AM
这个程序和train数据集里的时间列无关

都没读取时间列

我觉得时间列就是个坑

19

yinjh

DC币 16

Jul 15, 2016 11:15:36 AM
我做到第二都不需要时间列

如果第一名用了

我估计我就差在这上面了


20

yinjh

DC币 16

Jul 15, 2016 11:17:45 AM
21

jayr110

DC币 0

Jul 15, 2016 2:04:51 PM

点赞,研究中

22

yinjh

DC币 16

Jul 15, 2016 4:39:00 PM

这个程序还可以做用户的相似度衡量或者商品的相似度衡量

23

yinjh

DC币 16

Jul 16, 2016 9:07:13 AM

我知道至少有2位网友已经顺利运行了这个程序了

DL最好用GPU啊,否则比较慢的

24

gaodahong

DC币 1

Jul 16, 2016 11:02:52 AM

@yinjh大神,给我们讲讲思路呗

25

deep_wind

DC币 0

Jul 19, 2016 9:57:24 AM

非常感谢分享~~如果顺带给点思路应该会帮助更大~~仅仅是VGG调参获取的效果吗?

26

yhfgg

DC币 1

Jul 22, 2016 3:45:20 PM

敢问楼主跑这个程序需要多久,用了什么配置的电脑?


27

不是猫

DC币 0

Aug 18, 2016 9:49:21 PM

@yinjh

感谢大神分享!这个程序在我电脑可以完美运行。想问两个相关问题,望解惑。

1,两个Embedding层的 output_dim=128 是怎么来的呢?

2,两个Dense层节点数 512 和 64 又是怎么来的呢?

如果这几个数字都是试出来的,那您在试这几个参数的时候都是从什么数字试起,以怎样的跨度试的呢?

再次感谢!


28

R大大

DC币 0

Nov 20, 2016 5:43:40 PM
@thinkers 你好,能给下YESBOY的联系方式吗@Yes,boy!
29

猪猪侠

DC币 4

Dec 14, 2016 10:49:24 AM
@yinjh您好,好久没有登录这个论坛了。能告诉我你的代码是哪个赛题的吗??不胜感激
30

Gentleee

DC币 0

Feb 5, 2017 6:59:38 PM

腻害,作为新手学习一下~~

31

王露茜

DC币 0

Feb 10, 2017 3:53:38 PM
@yinjh能问一下这个数据起步是怎么处理的吗,电脑读入相当困难,无法处理,本人小白。
32
用户
反馈