import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * @author gjf
 *
 * db_pre.arff stores the author information extracted from the DBLP XML file.
 */
public class ElmAuth {

    /** Author name -> number of papers the author appears on (support count). */
    Map<String, Integer> map = new HashMap<>();

    /**
     * Step 1: extract the author names from the XML file, write them to dst
     * (db_pre.arff) — one paper per line, authors separated by "," — and
     * replace special characters that would confuse later processing.
     *
     * @param src input XML file (dblp.xml)
     * @param dst output file (db_pre.arff)
     */
    public void settleXml(String src, String dst) {
        // try-with-resources closes the streams even on an exception
        // (the original leaked them on any I/O error).
        try (BufferedReader br = new BufferedReader(new FileReader(new File(src)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(dst)))) {
            String line;
            boolean newPaper = true; // true until the first author of a paper is written
            int len = 0;             // authors seen on the current paper
            int max = 0;             // largest author count over all papers
            while ((line = br.readLine()) != null) {
                // NOTE(review): the tag literals were corrupted in this source;
                // "<author>" is reconstructed from the original offset (loc_st + 8).
                int locSt = line.indexOf("<author>");
                if (locSt != -1) {
                    int locEnd = line.indexOf("</author>");
                    // the text between <author> and </author> is one author name
                    line = line.substring(locSt + 8, locEnd);
                    line = line.replace('&', ' ');
                    line = line.replace('$', ' ');
                    line = line.replace("' ", " ");
                    line = line.replace("'", " ");
                    // authors of the same paper go on one line, comma-separated
                    if (newPaper) {
                        bw.write("\n");
                        bw.write(line);
                    } else {
                        bw.write(",");
                        bw.write(line);
                    }
                    len++; // one more author on this paper
                    newPaper = false;
                } else {
                    // any non-author line ends the current paper
                    newPaper = true;
                    if (max < len) max = len; // keep the largest author count
                    len = 0;
                    bw.flush();
                }
            }
            System.out.println("第一步 论文中具有最大的作者数:" + max);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Step 2: drop every paper (line) that has only a single author.
     *
     * @param src input file (db_pre.arff)
     * @param dst output file (db_elone.arff)
     */
    public void elimate_one(String src, String dst) {
        try (BufferedReader br = new BufferedReader(new FileReader(new File(src)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(dst)))) {
            String line;
            int res = 0; // number of lines kept
            while ((line = br.readLine()) != null) {
                // authors are separated by ","; fewer than two fields means
                // a single author, so the line is skipped
                if (line.split(",").length > 1) {
                    bw.write(line);
                    bw.write("\n");
                    res++;
                }
            }
            bw.flush();
            System.out.println("这篇论文中去除单个作者后的行数:" + res);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Load the remaining authors into {@link #map}: key = author name,
     * value = number of occurrences (support count).
     *
     * @param src input file (db_elone.arff)
     */
    public void createMap(String src) {
        try (BufferedReader br = new BufferedReader(new FileReader(new File(src)))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String author : line.split(",")) {
                    Integer cnt = map.get(author);
                    map.put(author, cnt == null ? 1 : cnt + 1);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Remove from {@link #map} every author whose support count is below
     * minsup (the run in main uses minsup = 100).
     *
     * @param minsup minimum support count an author must reach to be kept
     */
    public void settleMap(int minsup) {
        // NOTE(review): the condition was truncated in this source; removing
        // entries below minsup matches the method's documented intent.
        Iterator<String> it = map.keySet().iterator();
        while (it.hasNext()) {
            String str = it.next();
            if (map.get(str) < minsup) {
                it.remove(); // safe removal during iteration
            }
        }
        System.out.println("Map的大小,支持度大于100的作者个数:" + map.size());
    }

    /**
     * Write to dst only the authors that survived the minsup filter, keeping
     * the one-paper-per-line layout.
     *
     * @param src input file (db_elone.arff)
     * @param dst output file (db_minsup.arff)
     */
    public void updateMap(String src, String dst) {
        try (BufferedReader br = new BufferedReader(new FileReader(new File(src)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(dst)))) {
            String line;
            int res = 0;                // papers that kept at least one author
            boolean firstOnLine = true; // no author written yet for the current paper
            while ((line = br.readLine()) != null) {
                String[] arrLine = line.split(",");
                if (!firstOnLine) res++; // the previous paper kept at least one author
                firstOnLine = true;
                for (String author : arrLine) {
                    if (map.get(author) != null) { // author survived the filter
                        if (firstOnLine) {
                            bw.write("\n" + author);
                            firstOnLine = false;
                        } else {
                            bw.write("," + author);
                        }
                    }
                }
            }
            // count the final paper too (the original loop missed it)
            if (!firstOnLine) res++;
            bw.flush();
            System.out.println("符合筛选的作者合作写的论文篇数:" + res);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Generate the ARFF file Weka can read: one nominal attribute per kept
     * author, then one data row per paper ('t' if the author is on the paper,
     * '?' otherwise).
     *
     * @param src input file (db_minsup.arff)
     * @param dst output ARFF file (db.arff)
     */
    public void createWekaFile(String src, String dst) {
        try (BufferedReader br = new BufferedReader(new FileReader(new File(src)));
             BufferedWriter bw = new BufferedWriter(new FileWriter(new File(dst)))) {
            bw.write("@relation db" + "\n");
            for (String str : map.keySet()) {
                // escape single quotes inside the quoted attribute name
                // (the original discarded the replace() result, a no-op)
                bw.write("@attribute '" + str.replace("'", "\\'") + "' { t}\n");
            }
            bw.write("@data" + "\n");
            String line;
            while ((line = br.readLine()) != null) {
                String[] lineAuthors = line.split(",");
                boolean first = true; // first attribute on this data row
                for (String str : map.keySet()) {
                    // exact field comparison: the original used indexOf, which
                    // wrongly matched authors whose name is a substring of another
                    boolean present = false;
                    for (String a : lineAuthors) {
                        if (a.equals(str)) {
                            present = true;
                            break;
                        }
                    }
                    char ch = present ? 't' : '?';
                    if (first) {
                        bw.write(ch);
                    } else {
                        bw.write("," + ch);
                    }
                    first = false;
                }
                bw.write("\n");
            }
            bw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Reset the author/support map so it can be rebuilt from a new file. */
    public void clearMap() {
        map.clear();
    }

    public static void main(String args[]) {
        ElmAuth elmauth = new ElmAuth();
        elmauth.settleXml("dblp.xml", "db_pre.arff");
        elmauth.elimate_one("db_pre.arff", "db_elone.arff");
        elmauth.createMap("db_elone.arff");
        elmauth.settleMap(100); // minimum support count
        elmauth.updateMap("db_elone.arff", "db_minsup.arff");
        // iterate filter/rebuild until the author set stabilises (fixed 20 rounds)
        for (int i = 0; i < 20; ++i) {
            System.out.println();
            elmauth.elimate_one("db_minsup.arff", "db_minsup_elone.arff");
            elmauth.clearMap();
            elmauth.createMap("db_minsup_elone.arff");
            elmauth.settleMap(100);
            elmauth.updateMap("db_minsup_elone.arff", "db_minsup.arff");
        }
        elmauth.createWekaFile("db_minsup.arff", "db.arff");
    }
}