Skip to content

Commit 6d0e4aa

Browse files
committed
Code related to indexing/searching
Indexing of the tables below and the related searches: 1) SELECT page_id, page FROM `page` WHERE `page_namespace`=14 2) SELECT cl_from, cl_to FROM `categorylinks`
1 parent aae3106 commit 6d0e4aa

File tree

10 files changed

+626
-88
lines changed

10 files changed

+626
-88
lines changed

WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ public static int getCategoryPageCount( int threshold )
3232
ResultSet rs = null;
3333
int updateQuery = 0;
3434

35-
String query = "SELECT COUNT(*) FROM `category` WHERE `cat_subcats`=0 AND `cat_pages`>0 AND `cat_pages`< ? ";
35+
String query = "SELECT COUNT(*) FROM `page_category` WHERE `cat_subcats`=0 AND `cat_pages`< ? ";
3636

3737

3838
try
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
/*
2+
* To change this template, choose Tools | Templates
3+
* and open the template in the editor.
4+
*/
5+
/**
6+
* KarshaAnnotate- Annotation tool for financial documents
7+
*
8+
* Copyright (C) 2013, Lanka Software Foundation and University of Maryland.
9+
*
10+
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General
11+
* Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
12+
* later version.
13+
*
14+
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
15+
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
16+
* details.
17+
*
18+
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see
19+
* <http://www.gnu.org/licenses/>.
20+
*
21+
* Date Author Changes Aug 13, 2013 Kasun Perera Created
22+
*
23+
*/
24+
package org.dbpedia.kasun.categoryprocessor;
25+
26+
27+
import java.io.*;
28+
import java.sql.Connection;
29+
import java.sql.PreparedStatement;
30+
import java.sql.ResultSet;
31+
import java.sql.SQLException;
32+
import java.util.ArrayList;
33+
import org.apache.lucene.queryparser.classic.ParseException;
34+
import org.dbpedia.kasun.searcher.Search;
35+
36+
/**
37+
* TODO- describe the purpose of the class
38+
*
39+
*/
40+
public class CategoryLinksDB
41+
{
42+
43+
public static void getCategoryByPageID() throws IOException
44+
{
45+
DB_connection con = new DB_connection();
46+
Connection connection = con.dbConnect();
47+
48+
FileWriter outFile;
49+
FileWriter outFile1;
50+
int pageID;
51+
String leafcategory;
52+
53+
54+
PreparedStatement ps = null;
55+
ResultSet rs = null;
56+
int updateQuery = 0;
57+
String temp = null;
58+
59+
60+
61+
62+
// System.out.println(line);
63+
// System.out.println(temp);
64+
65+
// String query = "SELECT cl_to FROM `categorylinks` WHERE `cl_from` = ? ";
66+
67+
// String query = "SELECT `cl_to` FROM `category_only_page` JOIN `categorylinks` ON `category_only_page`.`page_id` = `categorylinks`.`cl_from` WHERE `page_title` = '"+leafcategory+"'";
68+
69+
try
70+
{
71+
72+
73+
File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\page_id_page_title_leaf_categories_page_less_than_90.txt" );
74+
75+
String line;
76+
BufferedReader fileReader;
77+
fileReader = new BufferedReader( new FileReader( catPagesFile ) );
78+
//FileWriter outFile;
79+
// FileWriter outFileCatNotFound;
80+
81+
while ( ( line = fileReader.readLine() ) != null )
82+
{
83+
if ( !line.isEmpty() )
84+
{
85+
String splitLine[] = line.split( "\t" );
86+
leafcategory = splitLine[1].trim();
87+
pageID = Integer.valueOf( splitLine[0] );
88+
89+
String query = "SELECT `cl_to` FROM `categorylinks` WHERE `cl_from` = " + splitLine[0].trim();
90+
91+
92+
93+
94+
ps = connection.prepareStatement( query );
95+
// ps.setString( 1, temp );
96+
// ps.setInt( 1, pageID );
97+
rs = ps.executeQuery();
98+
int count = 0;
99+
100+
if ( rs.next() )
101+
{
102+
NodeDB.insertNode( pageID, leafcategory );
103+
// int childID= NodeDB.getCategoryId( leafcategory );
104+
do
105+
{
106+
//outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\category_match_article_pages.txt", true );
107+
//outFile.append( rs.getString( "cat_id" ) + "\t" + rs.getString( "cat_title" ) + "\t" + rs.getString( "cat_pages" ) + "\t" + rs.getString( "cat_subcats" ) + "\t" + rs.getString( "cat_files" ) + "\t" + rs.getString( "cat_hidden" ) + "\n" );
108+
// outFile.close();
109+
//insertCategory( rs.getInt( "cat_id"), rs.getString( "cat_title" ), rs.getInt( "cat_pages"), rs.getInt( "cat_subcats"), rs.getInt( "cat_files"), rs.getBoolean( "cat_hidden" ) );
110+
int parentID = PageDB.getPageId( rs.getString( "cl_to" ).trim() );
111+
if ( parentID > 0 )
112+
{
113+
NodeDB.insertNode( parentID, rs.getString( "cl_to" ).trim() );
114+
// int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" ) );
115+
116+
EdgeDB.insertEdge( parentID, pageID );
117+
} else
118+
{
119+
outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\Parent_child_not_inderted_to_node_table.txt", true );
120+
outFile1.append( rs.getString( "cl_to" ).trim() + "\n" );
121+
outFile1.close();
122+
}
123+
count++;
124+
125+
} while ( rs.next() );
126+
} else
127+
{
128+
129+
outFile = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_pages_not_found_in_page_table.txt", true );
130+
outFile.append( pageID + "\t" + leafcategory + "\n" );
131+
outFile.close();
132+
133+
//System.out.println( line );
134+
// No data
135+
}
136+
137+
System.out.println( count );
138+
}
139+
}
140+
141+
connection.close();
142+
} catch ( SQLException e )
143+
{
144+
e.printStackTrace();
145+
// return 0;
146+
}
147+
148+
149+
150+
}
151+
152+
public static void insertParentChild() throws IOException, ParseException
153+
{
154+
155+
156+
FileWriter outFile;
157+
FileWriter outFile1;
158+
int pageID;
159+
// int catID;
160+
String leafcategory;
161+
162+
163+
164+
int updateQuery = 0;
165+
String temp = null;
166+
167+
168+
169+
170+
try
171+
{
172+
173+
174+
File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\page_id_page_title_leaf_categories_page_less_than_90.txt" );
175+
176+
String line;
177+
BufferedReader fileReader;
178+
fileReader = new BufferedReader( new FileReader( catPagesFile ) );
179+
//FileWriter outFile;
180+
// FileWriter outFileCatNotFound;
181+
182+
while ( ( line = fileReader.readLine() ) != null )
183+
{
184+
if ( !line.isEmpty() )
185+
{
186+
String splitLine[] = line.split( "\t" );
187+
leafcategory = splitLine[1].trim();
188+
// catID= ;
189+
pageID = PageDB.getPageId( leafcategory );
190+
NodeDB.insertNode( pageID, leafcategory );
191+
192+
/*
193+
* search index and get the cl_to by pageID
194+
*/
195+
196+
ArrayList<String> listOfClTo = Search.SearchCatPageLinks( pageID );
197+
198+
for ( int i = 0; i < listOfClTo.size(); i++ )
199+
{
200+
201+
int parentID = PageDB.getPageId( listOfClTo.get( i ) );
202+
if ( parentID > 0 )
203+
{
204+
NodeDB.insertNode( parentID, listOfClTo.get( i ) );
205+
// int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" ) );
206+
207+
EdgeDB.insertEdge( parentID, pageID );
208+
} else
209+
{
210+
outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\Parent_child_not_inderted_to_node_table.txt", true );
211+
outFile1.append( listOfClTo.get( i ) + "\n" );
212+
outFile1.close();
213+
}
214+
// count++;
215+
216+
}
217+
218+
}
219+
}
220+
} catch ( Exception e )
221+
{
222+
e.printStackTrace();
223+
// return 0;
224+
}
225+
226+
227+
228+
}
229+
}

WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,37 @@ public class CategoryProcesor
2222
*/
2323
public static void main( String[] args ) throws IOException
2424
{
25+
// inser category_only_pages
2526

27+
//File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_dir\\pages_page_namespace_14_new_complete_line.txt" );
28+
29+
File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\leaf_categories_page_less_than_90.txt" );
30+
31+
String line;
32+
BufferedReader fileReader;
33+
fileReader = new BufferedReader( new FileReader( catPagesFile ) );
34+
//FileWriter outFile;
35+
// FileWriter outFileCatNotFound;
36+
FileWriter outFile = new FileWriter("F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\page_id_page_title_leaf_categories_page_less_than_90.txt", true);
37+
38+
while ( ( line = fileReader.readLine() ) != null )
39+
{
40+
if ( !line.isEmpty() )
41+
{
42+
String splitLine[]= line.split("\t");
43+
int pageId= PageDB.getPageId( splitLine[1].trim() );
44+
outFile.append( pageId +"\t"+splitLine[1].trim()+"\n" );
45+
// CategoryLinksDB.getCategoryByPageID( );
46+
47+
48+
}
49+
}
50+
51+
outFile.close();
52+
2653

2754
// CategoryDB.getCategoryByName();
28-
55+
/*
2956
File uniqueCatNamesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_not_found_in_category_table_ca_replaced_part_3.txt" );
3057
String line;
3158
BufferedReader fileReader;
@@ -42,18 +69,23 @@ public static void main( String[] args ) throws IOException
4269
}
4370
}
4471
72+
*/
4573

46-
47-
/*
74+
/*
4875
System.out.println("Threshold \t" +"Page Count");
4976
// TODO code application logic here
50-
for(int i=1; i<10; i++){
77+
78+
79+
for(int i=1; i<100000; i++){
80+
FileWriter outFile = new FileWriter("F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\page_threshold_values.txt", true);
81+
5182
int pageCount= CategoryDB.getCategoryPageCount( i );
52-
System.out.println(i+"\t" +pageCount);
83+
outFile.append(i+"\t" +pageCount+"\n");
84+
// System.out.println(i+"\t" +pageCount);
5385
54-
86+
outFile.close();
5587
}
56-
*/
88+
*/
5789
/*
5890
5991
Scanner fileScanner = null;

WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ public static void inserDataToDB(Scanner fileScanner) throws IOException{
5353

5454

5555
//insert parent and child to the node- duplicate enties are handle by the SQL
56-
NodeDB.insertNode( parent );
56+
// NodeDB.insertNode( parent );
5757

5858
outFile1.append(parent+"\n");
59-
NodeDB.insertNode( child);
59+
// NodeDB.insertNode( child);
6060
outFile2.append(child+"\n");
6161
//get child and parent Ids
6262
parentId=NodeDB.getCategoryId( parent );

WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/NodeDB.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,22 +29,23 @@
2929
*/
3030
public class NodeDB {
3131

32-
public static void insertNode( String categoryName){
32+
public static void insertNode( int nodeID, String categoryName){
3333
DB_connection con = new DB_connection();
3434
Connection connection = con.dbConnect();
3535
PreparedStatement ps = null;
3636
ResultSet rs = null;
3737
int updateQuery = 0;
3838

39-
String query = "INSERT IGNORE INTO node(category_name,is_leaf,is_prominent) VALUES (?,?,?)";
39+
String query = "INSERT IGNORE INTO node(node_id,category_name,is_leaf,is_prominent) VALUES (?,?,?,?)";
4040

4141

4242
try
4343
{
4444
ps = connection.prepareStatement(query);
45-
ps.setString( 1, categoryName);
46-
ps.setBoolean( 2, false);
45+
ps.setInt( 1, nodeID);
46+
ps.setString( 2, categoryName);
4747
ps.setBoolean( 3, false);
48+
ps.setBoolean( 4, false);
4849
updateQuery = ps.executeUpdate();
4950

5051
// while (rs.next())

0 commit comments

Comments
 (0)