import java.awt.datatransfer.MimeType;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import pl.edu.icm.model.bwmeta.y.YContentEntry;
import pl.edu.icm.model.bwmeta.y.YContentFile;
import pl.edu.icm.model.bwmeta.y.YDescription;
import pl.edu.icm.model.bwmeta.y.YElement;
import pl.edu.icm.model.bwmeta.y.YExportable;
import pl.edu.icm.model.bwmeta.y.YLanguage;
import pl.edu.icm.model.transformers.bwmeta.y.BwmetaTransformerConstants;
import pl.edu.icm.synat.api.services.index.fulltext.FulltextIndexService;
import pl.edu.icm.synat.api.services.index.fulltext.query.FulltextSearchQuery;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.SearchOperator;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.FieldCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.FieldRangeCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.BooleanCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.FieldRequest;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.ResultsFormat;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResult;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResults;
import pl.edu.icm.synat.api.services.index.fulltext.result.ResultField;
import pl.edu.icm.synat.api.services.store.PartType;
import pl.edu.icm.synat.api.services.store.StatelessStore;
import pl.edu.icm.synat.api.services.store.model.AbstractRecordPart;
import pl.edu.icm.synat.api.services.store.model.Record;
import pl.edu.icm.synat.api.services.store.model.RecordId;
import pl.edu.icm.synat.api.services.store.model.TextRecordPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.AddOrUpdateRecordTextPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.BatchOperations;
import pl.edu.icm.synat.application.model.bwmeta.utils.BwmetaConverterUtils;
import pl.edu.icm.synat.console.scripting.utils.ServicesUtils;
import pl.edu.icm.synat.logic.index.publication.CollectionIndexFieldConstants;
import pl.edu.icm.model.transformers.MetadataModel;
import pl.edu.icm.model.transformers.bwmeta.y.BwmetaTransformerConstants;
import pl.edu.icm.synat.api.services.store.StatelessStore;
import pl.edu.icm.synat.api.services.store.StoreClient;
import pl.edu.icm.synat.api.services.store.model.batch.impl.DefaultStoreClient;
import pl.edu.icm.synat.application.model.bwmeta.transformers.SynatMetadataTransformers;
import pl.edu.icm.synat.application.model.bwmeta.utils.BWMetaDeserializer;
import pl.edu.icm.synat.application.model.bwmeta.utils.BWMetaDeserializerImpl;
import pl.edu.icm.synat.process.common.model.impl.util.ModifiedDocumentFactoryImpl;
import pl.edu.icm.synat.process.common.model.impl.util.ModifiedDocumentUtils;
import pl.edu.icm.synat.process.common.writer.BasicWriterSupportImpl;
import pl.edu.icm.synat.process.common.repository.DocumentRepository;
import pl.edu.icm.synat.process.common.repository.ModifiedDocumentRepository;
import pl.edu.icm.synat.process.common.model.api.Document;
import pl.edu.icm.synat.process.common.model.api.NativeDocument;
import pl.edu.icm.synat.process.common.model.api.attachment.Attachment;

/*
 * Groovy console script.
 *
 * Pages through the index entries selected by the "level" criterion below,
 * keeps those whose document id contains the provider name, and for every such
 * document that has no description yet downloads the provider's page to add an
 * abstract and a cover image.
 *
 * `serviceUtils` is not defined in this file; presumably it is supplied by the
 * scripting console binding - TODO confirm.
 * `repository` is deliberately left undeclared so that it lives in the script
 * binding and is visible from the methods below.
 */
String provider = "springer";
repository = buildRepository();
try {
    findAllWithoutCover(provider);
} catch (Exception e) {
    e.printStackTrace();
}

/**
 * Walks the index in pages of 1000 results and calls {@link #workWithDoc} for
 * every result whose document id contains {@code provider}.
 *
 * A failure on one document is logged and does not stop the run.
 *
 * @param provider provider name; also the suffix of the provider specific
 *                 methods (preparePageUrl_*, parseAbstract_*, findCoverUrl_*)
 */
void findAllWithoutCover(String provider) {
    FulltextIndexService index = (FulltextIndexService) serviceUtils.getService("Index", FulltextIndexService.class);
    int batchSize = 1000;
    ResultsFormat resultsFormat = new ResultsFormat(new FieldRequest("thumbnailPath", true));
    FulltextSearchQuery searchQuery = new FulltextSearchQuery(0, batchSize, resultsFormat,
            new FieldCriterion("level", "bwmeta1.level.hierarchy_Journal_Journal", SearchOperator.AND));

    def results = index.performSearch(searchQuery).getResults();
    println "SIZE " + results.size();

    int skip = 0;
    while (results.size() > 0) {
        skip += batchSize;
        for (result in results) {
            def docId = result.getDocId();
            if (docId.contains(provider)) {
                try {
                    workWithDoc(docId, provider);
                } catch (Exception e) {
                    // Log the exception itself: getMessage() alone is "null" for e.g. a NullPointerException.
                    log("Error on " + docId + ", " + e);
                }
            }
        }
        // Ask for the next page.
        searchQuery.setFirst(skip);
        searchQuery.setSize(batchSize);
        results = index.performSearch(searchQuery).getResults();
        log("results size = " + results.size());
    }
}

/**
 * Loads one document from the repository and updates it when its metadata is
 * a {@code YElement} that {@link #needsUpdate} accepts. Anything else is
 * silently skipped.
 */
void workWithDoc(String docId, String provider) {
    NativeDocument nativeDoc = repository.fetchDocument(docId);
    if (!(nativeDoc instanceof Document)) {
        return;
    }
    Document doc = (Document) nativeDoc;
    YExportable yExportable = doc.getMetadata();
    if (!(yExportable instanceof YElement)) {
        return;
    }
    YElement element = (YElement) yExportable;
    if (needsUpdate(element)) {
        updateElement(doc, element, provider);
    }
}

/**
 * An element qualifies for an update when it has no descriptions at all.
 *
 * NOTE(review): elements that already have a description but no cover are
 * skipped here, although the entry point is called findAllWithoutCover -
 * confirm this pre-filter is intended.
 */
boolean needsUpdate(YElement element) {
    return element.getDescriptions().isEmpty();
}

/**
 * Downloads the provider page for {@code element} and applies the abstract
 * and cover found there.
 *
 * The three provider specific steps are resolved by name at runtime from the
 * {@code provider} suffix.
 *
 * @return true when the document was modified and stored, false otherwise
 */
boolean updateElement(Document doc, YElement element, String provider) {
    String pageUrl = this."preparePageUrl_${provider}"(element);
    if (pageUrl == null) {
        // The provider method already logged the missing identifier.
        return false;
    }
    String aboutContent = fetchRemoteContent(pageUrl);
    if (aboutContent == null) {
        return false;
    }
    String abstractValue = this."parseAbstract_${provider}"(aboutContent);
    String coverUrl = this."findCoverUrl_${provider}"(aboutContent);
    log("update " + element.getId());
    return updateDocument(doc, element, abstractValue, coverUrl);
}

/**
 * Adds the cover and/or the abstract to {@code element} when they are
 * available and not already present, then stores the document.
 *
 * @param abstractValue abstract text, may be null or empty
 * @param coverUrl      cover image URL, may be null or empty
 * @return true when the metadata changed and the document was stored
 */
boolean updateDocument(Document doc, YElement element, String abstractValue, String coverUrl) {
    boolean metadataModified = false;
    // isNotEmpty rather than a null check: findCoverUrl_elsevier yields "" when nothing is found.
    if (StringUtils.isNotEmpty(coverUrl) && !hasCover(element)) {
        addCover(doc, element, coverUrl);
        metadataModified = true;
    }
    if (StringUtils.isNotEmpty(abstractValue) && !hasAbstract(element)) {
        YLanguage lang = detectLang(element, abstractValue);
        YDescription newAbstract = new YDescription(lang, abstractValue, "abstract");
        element.addDescription(newAbstract);
        metadataModified = true;
    }
    if (metadataModified) {
        doc.setMetadata(element);
        repository.storeDocument(doc, null);
    }
    return metadataModified;
}

/** Language of the abstract; currently always English. */
YLanguage detectLang(YElement element, String abstractValue) {
    //TODO :detect lang
    return YLanguage.English;
}

/**
 * Downloads the image at {@code coverUrl}, registers it in the element's
 * contents as a "cover" file and attaches the bytes to the document.
 *
 * The file part of the URL is used as the content file id, as its only
 * location and as the attachment id. The mime type is always recorded as
 * image/jpeg; the response headers are not inspected.
 */
void addCover(Document doc, YElement element, String coverUrl) {
    byte[] data = fetchFileContent(coverUrl);
    String mimeType = "image/jpeg";
    String fileId = new URL(coverUrl).getFile();
    List locations = new ArrayList();
    locations.add(fileId);
    YContentFile newCover = new YContentFile(fileId, "cover", mimeType, locations);
    element.getContents().add(newCover);
    Attachment attachment = doc.addBinaryAttachment(fileId, data);
    attachment.addTagByKey("type", "source");
    attachment.addTagByKey("mime", mimeType);
}

/** @return true when the element has a description of type "abstract" */
boolean hasAbstract(YElement element) {
    for (YDescription entry : element.getDescriptions()) {
        if ("abstract".equals(entry.getType())) {
            return true;
        }
    }
    return false;
}

/** @return true when the element has a file content entry of type "cover" */
boolean hasCover(YElement element) {
    for (YContentEntry entry : element.getContents()) {
        if (entry.isFile() && "cover".equals(((YContentFile) entry).getType())) {
            return true;
        }
    }
    return false;
}

/** Writes one line to standard output. */
void log(String text) {
    System.out.println(text);
}

/** Creates a GET request for {@code url} that is retried up to 3 times. */
private GetMethod newGetMethod(String url) {
    GetMethod method = new GetMethod(url);
    method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(3, false));
    return method;
}

/** Executes {@code method} and throws HttpException unless the status is 200 OK. */
private void executeExpectingOk(GetMethod method) {
    int statusCode = new HttpClient().executeMethod(method);
    if (statusCode != HttpStatus.SC_OK) {
        throw new HttpException("Method failed: " + method.getStatusLine());
    }
}

/**
 * Downloads {@code path} and returns the response body as text.
 *
 * @throws HttpException on a non-200 status or a protocol error (logged first)
 * @throws IOException   on a malformed URL or a transport error
 */
String fetchRemoteContent(String path) throws HttpException, IOException {
    GetMethod method = null;
    try {
        // Fail early with MalformedURLException on a bad address.
        new URL(path);
        method = newGetMethod(path);
        executeExpectingOk(method);
        return org.apache.commons.io.IOUtils.toString(method.getResponseBodyAsStream());
    } catch (HttpException e) {
        log("http error on " + path);
        throw e;
    } finally {
        if (method != null) {
            method.releaseConnection();
        }
    }
}

/**
 * Downloads {@code imageUrl} and returns the raw response body.
 *
 * The connection is released in all cases.
 */
private byte[] fetchFileContent(String imageUrl) {
    GetMethod method = newGetMethod(imageUrl);
    try {
        executeExpectingOk(method);
        return org.apache.commons.io.IOUtils.toByteArray(method.getResponseBodyAsStream());
    } finally {
        method.releaseConnection();
    }
}

/**
 * Wires a ModifiedDocumentRepository on top of the "Store" service, writing
 * metadata with target format BWMETA_2_1 and target model Y.
 */
DocumentRepository buildRepository() {
    StatelessStore statelessStore = (StatelessStore) serviceUtils.getService("Store", StatelessStore.class);

    ModifiedDocumentUtils modifiedDocumentUtils = new ModifiedDocumentUtils();
    BWMetaDeserializer bwmetaDeserializer = new BWMetaDeserializerImpl();
    modifiedDocumentUtils.setBwmetaDeserializer(bwmetaDeserializer);

    ModifiedDocumentFactoryImpl documentFactory = new ModifiedDocumentFactoryImpl();
    documentFactory.setModifiedDocumentUtils(modifiedDocumentUtils);

    BasicWriterSupportImpl writerSupport = new BasicWriterSupportImpl();
    writerSupport.setTargetFormat(BwmetaTransformerConstants.BWMETA_2_1);
    writerSupport.setTargetModel((MetadataModel) BwmetaTransformerConstants.Y);
    writerSupport.setTransformerFactory(SynatMetadataTransformers.BTF);

    StoreClient storeClient = new DefaultStoreClient(statelessStore);

    ModifiedDocumentRepository repository = new ModifiedDocumentRepository();
    repository.setDocumentFactory(documentFactory);
    repository.setWriterSupport(writerSupport);
    repository.setStoreClient(storeClient);
    return repository;
}

/* --elsevier-- */

/**
 * Returns the src of the cover image, trying three selectors in turn.
 * The result is an empty string when none of them matches.
 */
String findCoverUrl_elsevier(String aboutContent) {
    org.jsoup.nodes.Document document = Jsoup.parse(aboutContent);
    Elements divs = document.select("img[id=cphContent_imgCoverLink]");
    if (divs.isEmpty()) {
        divs = document.select("img[id=cphContent_imgCover]");
    }
    if (divs.isEmpty()) {
        divs = document.select("[class=cover]").select("img");
    }
    return divs.attr("src");
}

/** Returns the text of the full-scope div, falling back to the aims-and-scope div. */
String parseAbstract_elsevier(String aboutContent) {
    org.jsoup.nodes.Document document = Jsoup.parse(aboutContent);
    Elements divs = document.select("div[id=fullScope]");
    if (divs.isEmpty()) {
        divs = document.select("div[class=aims_and_scope]");
    }
    return divs.text();
}

/**
 * Looks the journal up on the Elsevier site search by its ISSN and returns
 * the link of the hit.
 *
 * @return the journal page URL, or null when the element has no ISSN
 */
String preparePageUrl_elsevier(YElement element) {
    String issn = element.getId("bwmeta1.id-class.ISSN");
    if (issn == null) {
        log("No elsevierId for " + element.getId());
        // Searching for the literal "null" could only produce a wrong or missing hit.
        return null;
    }
    // NOTE(review): the ISSN is sent in a parameter named "isbn" - confirm this is what the search expects.
    String searchContent = fetchRemoteContent("http://www.elsevier.com/s/search.html?form=sitesearch&collection=elsevier-meta&isbn=" + issn);
    return findJournalLink(searchContent, issn);
}

/**
 * Extracts the href of the link found under div#content / h2 / a in the
 * search result page and makes it absolute.
 *
 * @throws RuntimeException when no such link is present
 */
String findJournalLink(String searchContent, String issn) {
    org.jsoup.nodes.Document document = Jsoup.parse(searchContent);
    // select() yields an empty Elements rather than null, and attr() yields "" when nothing matches.
    String result = document.select("div[id=content]").select("h2").select("a").attr("href");
    if (StringUtils.isEmpty(result)) {
        throw new RuntimeException("Issn not found: " + issn);
    }
    if (!result.startsWith("http")) {
        result = "http://www.elsevier.com" + result;
    }
    return result;
}

/* --springer-- */

/**
 * Finds the cover image URL on a Springer page.
 *
 * @return the image URL (prefixed with "http:" when it does not start with
 *         "http"), or null when no image tag is found
 */
String findCoverUrl_springer(String aboutContent) {
    // NOTE(review): both delimiters are a bare line break here, which looks like the
    // original HTML markers were lost - confirm the intended open/close markers.
    String imgTag = StringUtils.substringBetween(aboutContent, "\n", "\n");
    String imageUrl = null;
    // NOTE(review): the pattern in the file started with "]+src"; the "<img[^>" prefix
    // is reconstructed from the trailing "[^>]*>" - confirm.
    String imageUrlRegex = "<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>";
    Pattern imageUrlPattern = Pattern.compile(imageUrlRegex);
    // substringBetween returns null when a delimiter is missing; matcher(null) would throw.
    if (imgTag != null) {
        Matcher m = imageUrlPattern.matcher(imgTag);
        if (m.find()) {
            imageUrl = m.group(1);
            if (!imageUrl.startsWith("http")) {
                imageUrl = "http:" + imageUrl;
            }
        }
    }
    log("cover: " + imageUrl);
    return imageUrl;
}

/**
 * Builds the "about" page URL from the element's Springer identifier.
 *
 * @return the page URL, or null when the element has no Springer id
 */
String preparePageUrl_springer(YElement element) {
    String springerId = element.getId("bwmeta1.id-class.Springer");
    if (springerId == null) {
        log("No springerId for " + element.getId());
        // A URL containing "null" would only fail later with a misleading HTTP error.
        return null;
    }
    return "http://www.springer.com/journal/" + springerId + "/about";
}

/**
 * Extracts the abstract from a Springer "about" page.
 *
 * @return the abstract markup, possibly empty, or null when the page does not
 *         contain exactly one div with class colLeftContentContainer
 */
String parseAbstract_springer(String aboutContent) {
    org.jsoup.nodes.Document document = Jsoup.parse(aboutContent);
    Elements divs = document.select("div[class=colLeftContentContainer]");
    if (divs.size() != 1) {
        log("No div!");
        return null;
    }
    String result = prepareText(divs.get(0));
    if (StringUtils.isEmpty(result)) {
        log("No abstract");
    }
    return result;
}

/**
 * Concatenates the inner HTML of the direct children whose class attribute is
 * exactly "springerHTML", each followed by a separator, and stops at the first
 * other child that is a p element. Remaining children are ignored.
 */
private String prepareText(Element div) {
    StringBuilder text = new StringBuilder();
    for (Element child : div.children()) {
        if ("springerHTML".equals(child.attr("class"))) {
            text.append(child.html());
            // NOTE(review): the separator is a bare line break here; it may have been markup originally - confirm.
            text.append("\n");
        } else if ("p".equals(child.tagName())) {
            break;
        }
    }
    return text.toString();
}