import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.List;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import pl.edu.icm.model.bwmeta.y.YDescription;
import pl.edu.icm.model.bwmeta.y.YElement;
import pl.edu.icm.model.bwmeta.y.YLanguage;
import pl.edu.icm.model.transformers.bwmeta.y.BwmetaTransformerConstants;
import pl.edu.icm.synat.api.services.index.fulltext.FulltextIndexService;
import pl.edu.icm.synat.api.services.index.fulltext.query.FulltextSearchQuery;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.SearchOperator;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.FieldCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.FieldRequest;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.ResultsFormat;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResult;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResults;
import pl.edu.icm.synat.api.services.store.PartType;
import pl.edu.icm.synat.api.services.store.StatelessStore;
import pl.edu.icm.synat.api.services.store.model.AbstractRecordPart;
import pl.edu.icm.synat.api.services.store.model.Record;
import pl.edu.icm.synat.api.services.store.model.RecordId;
import pl.edu.icm.synat.api.services.store.model.TextRecordPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.AddOrUpdateRecordTextPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.BatchOperations;
import pl.edu.icm.synat.application.model.bwmeta.utils.BwmetaConverterUtils;
import pl.edu.icm.synat.console.scripting.utils.ServicesUtils;
import pl.edu.icm.synat.logic.index.publication.CollectionIndexFieldConstants;

/*
 * Console script: reads a '#'-separated journal list, downloads each journal's
 * "about" page, extracts the description text from it and stores that text as an
 * English abstract on the matching journal record (looked up by ISSN or EISSN).
 *
 * Relies on a `serviceUtils` object supplied by the surrounding scripting
 * environment; it is not defined in this file.
 */

String csvFile = "/home/mkali/journals_list_oct18_2012-1.csv";
BufferedReader br = null;
String line = "";
String cvsSplitBy = "#";

try {
    br = new BufferedReader(new FileReader(csvFile));
    while ((line = br.readLine()) != null) {
        // Cells are separated by '#'.
        String[] cells = line.split(cvsSplitBy);

        // String.split drops trailing empty cells, so a short or blank line would
        // otherwise raise ArrayIndexOutOfBoundsException below and abort the whole run.
        if (cells.length < 8) {
            System.err.println("Skipping line with fewer than 8 cells: " + line);
            continue;
        }

        String issn = cells[3];
        String eissn = cells[4];
        String aboutUrl = cells[7];
        aboutUrl = aboutUrl + "?detailsPage=aboutThis";
        System.out.println(aboutUrl);

        try {
            String aboutContent = fetchRemoteContent(aboutUrl);
            Document document = Jsoup.parse(aboutContent);
            Elements divs = document.select("div[class=colLeftContentContainer]");
            // Only pages with exactly one matching container are processed.
            if (divs.size() == 1) {
                Element element = divs.get(0);
                String preparedText = prepareText(element);
                if (StringUtils.isNotEmpty(preparedText)) {
                    updateRecord(issn, eissn, preparedText);
                }
            }
        } catch (Exception e) {
            // Best effort: a failure for one journal must not stop the remaining ones.
            e.printStackTrace();
        }
    }
} catch (FileNotFoundException e) {
    e.printStackTrace();
} catch (IOException e) {
    e.printStackTrace();
} finally {
    if (br != null) {
        try {
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

/**
 * Concatenates the inner HTML of the leading children of {@code div} whose
 * {@code class} attribute is exactly {@code springerHTML}, each followed by a
 * newline. Collection stops at the first {@code <p>} child that does not carry
 * that class; other children are skipped.
 *
 * @param div container element whose direct children are scanned
 * @return the collected HTML, or an empty string when nothing matched
 */
private String prepareText(Element div) {
    StringBuilder collected = new StringBuilder();
    for (Element child : div.children()) {
        if ("springerHTML".equals(child.attr("class"))) {
            collected.append(child.html());
            collected.append("\n");
        } else if ("p".equals(child.tagName())) {
            break;
        }
    }
    return collected.toString();
}

/**
 * Downloads the body of {@code path} with an HTTP GET (up to 3 retries).
 *
 * @param path absolute URL to fetch
 * @return the response body as text
 * @throws HttpException when the response status is not 200 OK
 * @throws IOException when the URL is malformed or the transfer fails
 */
private String fetchRemoteContent(String path) throws HttpException, IOException {
    GetMethod method = null;
    try {
        // Constructed only for validation: throws MalformedURLException on a bad URL.
        new URL(path);

        HttpClient httpClient = new HttpClient();
        method = new GetMethod(path);
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler(3, false));

        int statusCode = httpClient.executeMethod(method);
        if (statusCode != HttpStatus.SC_OK) {
            throw new HttpException("Method failed: " + method.getStatusLine());
        }

        byte[] responseBody = method.getResponseBody();
        // NOTE(review): decodes with the platform default charset; confirm this
        // matches the encoding of the fetched pages.
        return new String(responseBody);
    } finally {
        if (method != null) {
            method.releaseConnection();
        }
    }
}

/**
 * Looks up a journal record by ISSN (or, when the ISSN is empty, by EISSN) and,
 * if exactly one record is found and it has no description of type
 * {@code "abstract"}, adds {@code data} as an English abstract and writes the
 * updated metadata part back to the store.
 *
 * Nothing is done when both identifiers are empty. When the search does not
 * return exactly one hit, the hit count is printed and no record is modified.
 *
 * @param issn  journal ISSN; may be empty
 * @param eissn journal EISSN, used only when {@code issn} is empty; may be empty
 * @param data  text to store as the abstract
 */
private void updateRecord(String issn, String eissn, String data) {
    String metadataPart = "metadata/BWmeta-2.1.0";
    String journalLevel = "bwmeta1.level.hierarchy_Journal_Journal";

    FulltextIndexService index =
            (FulltextIndexService) serviceUtils.getService("CollectionIndex", FulltextIndexService.class);
    ResultsFormat resultsFormat =
            new ResultsFormat(new FieldRequest(CollectionIndexFieldConstants.FIELD_EXID, true));

    FulltextSearchQuery searchQuery = null;
    if (StringUtils.isNotEmpty(issn)) {
        searchQuery = new FulltextSearchQuery(0, 10, resultsFormat,
                new FieldCriterion("externalIdentifier_#_bwmeta1.id-class.ISSN", issn, SearchOperator.AND),
                new FieldCriterion("level", journalLevel, SearchOperator.AND));
    } else if (StringUtils.isNotEmpty(eissn)) {
        searchQuery = new FulltextSearchQuery(0, 10, resultsFormat,
                new FieldCriterion("externalIdentifier_#_bwmeta1.id-class.EISSN", eissn, SearchOperator.AND),
                new FieldCriterion("level", journalLevel, SearchOperator.AND));
    }
    if (searchQuery == null) {
        return;
    }

    FulltextSearchResults fulltextSearchResults = index.performSearch(searchQuery);
    List<FulltextSearchResult> results = fulltextSearchResults.getResults();
    for (FulltextSearchResult result : results) {
        System.out.println(result.getDocId());
    }

    // An ambiguous or missing match is reported instead of guessing a record.
    if (results.size() != 1) {
        System.out.println("Wrong results size: " + results.size());
        return;
    }

    String docId = results.get(0).getDocId();
    StatelessStore store = (StatelessStore) serviceUtils.getService("Store", StatelessStore.class);
    Record record = store.fetchRecord(new RecordId(docId), metadataPart);
    AbstractRecordPart part = record.getPart(metadataPart);
    YElement element = BwmetaConverterUtils.bwmetaToYElement(((TextRecordPart) part).getTextContent());

    boolean hasAbstract = false;
    for (YDescription entry : element.getDescriptions()) {
        if ("abstract".equals(entry.getType())) {
            hasAbstract = true;
            break;
        }
    }

    // Existing abstracts are never overwritten.
    if (!hasAbstract) {
        YDescription newAbstract = new YDescription(YLanguage.English, data, "abstract");
        element.getDescriptions().add(newAbstract);
        String newBwmeta =
                BwmetaConverterUtils.YElementToBwmeta(element, BwmetaTransformerConstants.BWMETA_2_1);

        BatchOperations operationsToExecute = new BatchOperations();
        operationsToExecute.getOperations().add(new AddOrUpdateRecordTextPart(
                record.getIdentifier(), PartType.SOURCE, metadataPart, newBwmeta,
                part.getTags().toArray(new String[0])));
        store.executeBatch(operationsToExecute);
        System.out.println("Record " + record.getIdentifier() + " updated");
    }
}