import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.h2.util.IOUtils;
import org.apache.commons.lang.StringUtils;
import pl.edu.icm.model.bwmeta.y.YContentEntry;
import pl.edu.icm.model.bwmeta.y.YContentFile;
import pl.edu.icm.model.bwmeta.y.YElement;
import pl.edu.icm.model.transformers.bwmeta.y.BwmetaTransformerConstants;
import pl.edu.icm.synat.api.services.index.fulltext.FulltextIndexService;
import pl.edu.icm.synat.api.services.index.fulltext.query.FulltextSearchQuery;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.SearchOperator;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.FieldCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.FieldRequest;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.ResultsFormat;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResult;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResults;
import pl.edu.icm.synat.api.services.store.PartType;
import pl.edu.icm.synat.api.services.store.StatelessStore;
import pl.edu.icm.synat.api.services.store.model.AbstractRecordPart;
import pl.edu.icm.synat.api.services.store.model.Record;
import pl.edu.icm.synat.api.services.store.model.RecordId;
import pl.edu.icm.synat.api.services.store.model.TextRecordPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.AddOrUpdateRecordBinaryPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.AddOrUpdateRecordTextPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.BatchOperations;
import pl.edu.icm.synat.application.model.bwmeta.utils.BwmetaConverterUtils;
import pl.edu.icm.synat.console.scripting.utils.ServicesUtils;
import pl.edu.icm.synat.logic.index.publication.CollectionIndexFieldConstants;

/*
 * Console script: reads a '#'-separated journal list, downloads each journal's
 * "about" page, extracts an image URL from it and, when exactly one journal
 * record matches the ISSN/EISSN in the collection index, attaches the image to
 * that record as its cover.
 *
 * `serviceUtils` is not declared anywhere in this script; it is presumably
 * supplied by the scripting console binding -- confirm before running elsewhere.
 */

def csvFile = "/home/mkali/journals_list_oct18_2012-1.csv";
String cvsSplitBy = "#";

// Zero-based column positions read from every CSV row.
final int issnColumn = 3;
final int eissnColumn = 4;
final int aboutUrlColumn = 7;

// NOTE(review): both delimiters are a bare line break in the text under review,
// which looks like markup that was lost when the file was mangled. The value is
// preserved as found; restore the real start/end markers before running.
String imgSectionStart = "\n";
String imgSectionEnd = "\n";

// The pattern as found began with "]+src", which cannot match an image tag;
// the "<img[^>" prefix is restored so that group(1) captures the src attribute.
// Compiled once because it is loop-invariant.
Pattern imageUrlPattern = Pattern.compile("<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>");

BufferedReader br = new BufferedReader(new FileReader(csvFile));
try {
    String line;
    while ((line = br.readLine()) != null) {
        // Limit -1 keeps trailing empty cells, so a row with an empty last
        // column still has the expected number of cells.
        String[] cells = line.split(cvsSplitBy, -1);
        if (cells.length <= aboutUrlColumn) {
            System.out.println("Skipping malformed line (" + cells.length + " cells): " + line);
            continue;
        }

        String issn = cells[issnColumn];
        String eissn = cells[eissnColumn];
        String aboutUrl = cells[aboutUrlColumn];

        if (StringUtils.isBlank(aboutUrl)) {
            System.out.println("Blank aboutUrl, issn="+issn);
            continue;
        }

        String aboutContent = fetchRemoteContent(aboutUrl);
        if (aboutContent == null) {
            // fetchRemoteContent has already reported the failure.
            continue;
        }

        // substringBetween returns null when either delimiter is missing.
        String imgTag = StringUtils.substringBetween(aboutContent, imgSectionStart, imgSectionEnd);
        if (imgTag == null) {
            System.out.println("No image section found, issn=" + issn + ", aboutUrl=" + aboutUrl);
            continue;
        }

        Matcher m = imageUrlPattern.matcher(imgTag);
        if (m.find()) {
            String imageUrl = m.group(1);
            // Anything not starting with "http" is treated as scheme-relative.
            if (!imageUrl.startsWith("http")) {
                imageUrl = "http:" + imageUrl;
            }
            System.out.println("imageUrl="+imageUrl);
            processRecord(issn, eissn, imageUrl);
        }
    }
} finally {
    br.close();
}

/**
 * Builds a GET request for the given address with a retry handler allowing
 * up to three retries.
 *
 * @param path absolute URL to request
 * @return the configured, not yet executed, request
 * @throws java.net.MalformedURLException if path is not a valid URL
 */
private GetMethod newGetMethod(String path) {
    // Constructed only to validate the address up front.
    new URL(path);
    GetMethod method = new GetMethod(path);
    method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
            new DefaultHttpMethodRetryHandler(3, false));
    return method;
}

/**
 * Executes the request and fails unless the server answers 200 OK.
 *
 * @param method request to execute; the caller must release its connection
 * @throws HttpException if the response status is not 200
 */
private void executeExpectingOk(GetMethod method) {
    int statusCode = new HttpClient().executeMethod(method);
    if (statusCode != HttpStatus.SC_OK) {
        throw new HttpException("Method failed: " + method.getStatusLine());
    }
}

/**
 * Downloads a text resource. This is best-effort: any failure is reported on
 * standard output and results in null rather than an exception.
 *
 * @param path absolute URL of the resource
 * @return the response body as text, or null if it could not be fetched
 */
private String fetchRemoteContent(String path) {
    try {
        GetMethod method = newGetMethod(path);
        try {
            executeExpectingOk(method);
            byte[] responseBody = method.getResponseBody();
            if (responseBody == null) {
                return null;
            }
            // Decode with the charset announced by the response instead of
            // the platform default.
            return new String(responseBody, method.getResponseCharSet());
        } finally {
            method.releaseConnection();
        }
    } catch (Exception e) {
        System.out.println("Failed to fetch " + path + ": " + e);
        return null;
    }
}

/**
 * Downloads a binary resource.
 *
 * @param path absolute URL of the resource
 * @return the raw response body
 * @throws HttpException if the response status is not 200
 */
private byte[] fetchFileContent(String path) {
    GetMethod method = newGetMethod(path);
    try {
        executeExpectingOk(method);
        return method.getResponseBody();
    } finally {
        method.releaseConnection();
    }
}

/**
 * Tells whether a CSV identifier cell holds a real value, i.e. is neither
 * empty nor the "n/a" placeholder.
 */
private boolean isUsableIdentifier(String value) {
    return StringUtils.isNotEmpty(value) && !"n/a".equals(value);
}

/**
 * Looks up the journal by ISSN (or by EISSN when the ISSN is unusable) and,
 * if exactly one record matches, downloads the image and stores it as the
 * record's cover.
 *
 * @param issn     print ISSN from the CSV; may be empty or "n/a"
 * @param eissn    electronic ISSN from the CSV; may be empty or "n/a"
 * @param imageURL absolute URL of the cover image
 */
private void processRecord(String issn, String eissn, String imageURL) {
    System.out.println("issn=" + issn + ", eissn=" + eissn);

    String identifierField;
    String identifierValue;
    if (isUsableIdentifier(issn)) {
        identifierField = "externalIdentifier_#_bwmeta1.id-class.ISSN";
        identifierValue = issn;
    } else if (isUsableIdentifier(eissn)) {
        identifierField = "externalIdentifier_#_bwmeta1.id-class.EISSN";
        identifierValue = eissn;
    } else {
        System.out.println("No usable ISSN or EISSN, skipping");
        return;
    }

    FulltextIndexService index = serviceUtils.getService("CollectionIndex", FulltextIndexService.class);
    ResultsFormat resultsFormat = new ResultsFormat(
            new FieldRequest(CollectionIndexFieldConstants.FIELD_EXID, true));
    FulltextSearchQuery searchQuery = new FulltextSearchQuery(0, 10, resultsFormat,
            new FieldCriterion(identifierField, identifierValue, SearchOperator.AND),
            new FieldCriterion("level", "bwmeta1.level.hierarchy_Journal_Journal", SearchOperator.AND));

    FulltextSearchResults fulltextSearchResults = index.performSearch(searchQuery);
    List<FulltextSearchResult> results = fulltextSearchResults.getResults();
    // Only an unambiguous match is updated.
    if (results.size() != 1) {
        System.out.println("Wrong results size: " + results.size());
        return;
    }

    try {
        String docId = results.get(0).getDocId();
        // The last path segment of the image URL becomes the part id.
        String fileId = new File(new URL(imageURL).getFile()).getName();
        byte[] data = fetchFileContent(imageURL);
        // NOTE(review): the mime type is fixed regardless of the downloaded file.
        updateRecord(docId, data, fileId, "image/jpeg");
    } catch (Exception e) {
        // One failing journal must not abort the rest of the run.
        e.printStackTrace();
    }
}

/**
 * Adds a cover to the record unless its metadata already lists one: the
 * metadata gains a "cover" content file entry and the image bytes are stored
 * as a binary part, both in a single batch.
 *
 * @param docId    identifier of the record to update
 * @param data     image bytes
 * @param fileId   id of the new content file and of the binary part
 * @param mimeType mime type recorded in the metadata and in the part tags
 */
private void updateRecord(String docId, byte[] data, String fileId, String mimeType) {
    final String metadataPartId = "metadata/BWmeta-2.1.0";

    StatelessStore store = serviceUtils.getService("Store", StatelessStore.class);
    RecordId recordId = new RecordId(docId);
    Record record = store.fetchRecord(recordId, metadataPartId);
    AbstractRecordPart part = record.getPart(metadataPartId);
    if (!(part instanceof TextRecordPart)) {
        System.out.println("Record " + docId + " has no textual " + metadataPartId + " part, skipping");
        return;
    }

    YElement element = BwmetaConverterUtils.bwmetaToYElement(((TextRecordPart) part).getTextContent());

    boolean hasCover = false;
    for (YContentEntry entry : element.getContents()) {
        if (entry.isFile() && "cover".equals(((YContentFile) entry).getType())) {
            hasCover = true;
            break;
        }
    }
    if (hasCover) {
        return;
    }

    List<String> locations = new ArrayList<String>();
    locations.add(fileId);
    YContentFile newCover = new YContentFile(fileId, "cover", mimeType, locations);
    element.getContents().add(newCover);
    String newBwmeta = BwmetaConverterUtils.YElementToBwmeta(element, BwmetaTransformerConstants.BWMETA_2_1);

    BatchOperations operationsToExecute = new BatchOperations();
    operationsToExecute.getOperations().add(new AddOrUpdateRecordTextPart(recordId, PartType.SOURCE,
            metadataPartId, newBwmeta, part.getTags().toArray(new String[0])));
    // The mime tag follows the mimeType argument rather than a fixed value.
    String[] tags = ["type:source", "mime:" + mimeType];
    operationsToExecute.getOperations().add(new AddOrUpdateRecordBinaryPart(recordId, PartType.SOURCE,
            fileId, new ByteArrayInputStream(data), tags));
    store.executeBatch(operationsToExecute);
    System.out.println("Record " + record.getIdentifier() + " updated");
}