python实现CNN中文文本分类
代码片段和文件信息
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master
文件 14 2017-06-15 02:39 zh_cnn_text_classify-master\.gitignore
文件 1917 2017-06-15 02:39 zh_cnn_text_classify-master\README.md
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\data
文件 59489 2017-06-15 02:39 zh_cnn_text_classify-master\data\ham_100.utf8
文件 44997 2017-06-15 02:39 zh_cnn_text_classify-master\data\spam_100.utf8
文件 4504 2017-06-15 02:39 zh_cnn_text_classify-master\data_helpers.py
文件 4870 2017-06-15 02:39 zh_cnn_text_classify-master\eval.py
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints
文件 697 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\checkpoint
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.meta
文件 46336 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\prediction.csv
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\summaries
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\summaries\dev
文件 159244 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\summaries\dev\events.out.tfevents.1492954586.escenter11PC
............此处省略7个文件信息
# encoding: UTF-8
import numpy as np
import re
import itertools
from collections import Counter
import os
import word2vec_helpers
import time
import pickle
def load_data_and_labels(input_text_file, input_label_file, num_labels):
    """Load the cleaned text and, when a label file exists, its labels.

    Args:
        input_text_file: Path handed to ``read_and_clean_zh_file``.
        input_label_file: Path of a text file holding one integer label
            per line. May point to a file that does not exist.
        num_labels: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A tuple ``(x_text, y)``. ``x_text`` is whatever
        ``read_and_clean_zh_file`` returns for ``input_text_file``. ``y`` is
        ``None`` when ``input_label_file`` does not exist, otherwise a list
        of ints, one per line of the label file.

    Raises:
        ValueError: If a line of the label file is not a valid integer.
    """
    x_text = read_and_clean_zh_file(input_text_file)

    y = None
    if os.path.exists(input_label_file):
        # Use a context manager so the handle is closed, and build a real
        # list so the labels can be indexed and iterated more than once.
        with open(input_label_file, "r") as label_file:
            y = [int(line) for line in label_file]

    return (x_text, y)
def load_positive_negative_data_files(positive_data_file, negative_data_file):
    """Load positive and negative examples and build one-hot labels.

    Both files are read through ``read_and_clean_zh_file``; the positive
    examples come first in the combined output.

    Args:
        positive_data_file: Path of the file with positive examples.
        negative_data_file: Path of the file with negative examples.

    Returns:
        A two-element list ``[x_text, y]``. ``x_text`` is the positive
        examples followed by the negative ones. ``y`` is an integer array of
        shape ``(len(x_text), 2)`` where positive rows are ``[0, 1]`` and
        negative rows are ``[1, 0]``.
    """
    # NOTE(review): assumes read_and_clean_zh_file returns a list of
    # sentences (the results are concatenated with ``+``) -- confirm.
    positive_examples = read_and_clean_zh_file(positive_data_file)
    negative_examples = read_and_clean_zh_file(negative_data_file)

    x_text = positive_examples + negative_examples

    labels = [[0, 1] for _ in positive_examples]
    labels.extend([1, 0] for _ in negative_examples)
    # reshape(-1, 2) keeps the (n, 2) shape even when one or both classes
    # are empty, where concatenating a (0,) array with an (n, 2) one fails.
    y = np.array(labels, dtype=int).reshape(-1, 2)

    return [x_text, y]
def padding_sentences(input_sentences, padding_token, padding_sentence_length=None):
    """Split sentences on single spaces and pad/truncate them to one length.

    Args:
        input_sentences: Iterable of strings whose tokens are separated by
            a single space.
        padding_token: Token appended to sentences that are too short.
        padding_sentence_length: Target length. When ``None`` the length of
            the longest sentence is used (``0`` if there are no sentences).

    Returns:
        A tuple ``(sentences, max_sentence_length)`` where ``sentences`` is
        a list of token lists, each exactly ``max_sentence_length`` long.
    """
    sentences = [sentence.split(' ') for sentence in input_sentences]

    if padding_sentence_length is not None:
        max_sentence_length = padding_sentence_length
    elif sentences:
        max_sentence_length = max(len(tokens) for tokens in sentences)
    else:
        # max() of an empty sequence raises; nothing to pad means length 0.
        max_sentence_length = 0

    for tokens in sentences:
        if len(tokens) > max_sentence_length:
            # Truncate in place. Rebinding the loop variable to a slice
            # would leave the list stored in ``sentences`` untouched.
            del tokens[max_sentence_length:]
        else:
            tokens.extend([padding_token] * (max_sentence_length - len(tokens)))

    return (sentences, max_sentence_length)
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield consecutive batches of ``data`` for a number of epochs.

    Args:
        data: Sequence of examples; it is converted with ``np.array``.
        batch_size: Maximum number of examples per batch.
        num_epochs: How many passes over the data to make.
        shuffle: When true, the data is re-permuted with ``np.random`` at
            the start of every epoch.

    Yields:
        Slices of the (optionally shuffled) array with at most
        ``batch_size`` rows. The last batch of an epoch may be shorter.
        Nothing is yielded when ``data`` is empty.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division. The float form int((n - 1) / b) + 1 gives
    # one spurious empty batch for n == 0 on Python 3 and differs between
    # Python 2 and Python 3.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size

    for _ in range(num_epochs):
        if shuffle:
            # Fresh permutation every epoch.
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data

        for batch_num in range(num_batches_per_epoch):
            start_idx = batch_num * batch_size
            end_idx = min(start_idx + batch_size, data_size)
            yield epoch_data[start_idx:end_idx]
def test():
    """Manual smoke-test hook; currently it only prints a marker line.

    The commented lines below are disabled example calls kept for
    reference; none of them run.
    """
    print("Test")
    # Test clean_str
    # print(clean_str("This's a huge dog! Who're going to the top."))
    # Test load_positive_negative_data_files
    # x_text, y = load_positive_negative_data_files(
    #     "./tiny_data/rt-polarity.pos", "./tiny_data/rt-polarity.neg")
    # print(x_text)
    # print(y)
    # Test batch_iter
    # batches = batch_iter(x_text, 2, 4)
    # for batch in batches:
    #     print(batch)
def mkdir_if_not_exist(dirpath):
    """Create ``dirpath`` (and any missing parents) unless it already exists.

    Args:
        dirpath: Directory path to create.

    Raises:
        OSError: If creation fails and nothing exists at ``dirpath``
            afterwards (for example, permission denied).
    """
    try:
        # Try first instead of checking first: this avoids the race where
        # another process creates the directory between check and create.
        os.makedirs(dirpath)
    except OSError:
        # An existing path is not an error, matching the original
        # "do nothing if it exists" behaviour.
        if not os.path.exists(dirpath):
            raise
def seperate_line(line):
    """Return ``line`` with a single space appended after every character.

    The result therefore ends with a trailing space whenever ``line`` is
    non-empty, e.g. ``"ab"`` becomes ``"a b "``.

    Args:
        line: String (or any iterable of strings) to space out.

    Returns:
        The spaced-out string; ``""`` for empty input.
    """
    return "".join(char + " " for char in line)
def read_and_clean_zh_file(input_file output_cleaned_file =
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master
文件 14 2017-06-15 02:39 zh_cnn_text_classify-master\.gitignore
文件 1917 2017-06-15 02:39 zh_cnn_text_classify-master\README.md
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\data
文件 59489 2017-06-15 02:39 zh_cnn_text_classify-master\data\ham_100.utf8
文件 44997 2017-06-15 02:39 zh_cnn_text_classify-master\data\spam_100.utf8
文件 4504 2017-06-15 02:39 zh_cnn_text_classify-master\data_helpers.py
文件 4870 2017-06-15 02:39 zh_cnn_text_classify-master\eval.py
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints
文件 697 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\checkpoint
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-200.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-300.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-400.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-500.meta
文件 2373156 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.data-00000-of-00001
文件 1009 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.index
文件 102143 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\checkpoints\model-600.meta
文件 46336 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\prediction.csv
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\summaries
目录 0 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\summaries\dev
文件 159244 2017-06-15 02:39 zh_cnn_text_classify-master\runs\1492954581\summaries\dev\events.out.tfevents.1492954586.escenter11PC
............此处省略7个文件信息
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件举报,一经查实,本站将立刻删除。
评论列表(条)