// Console script: for journal-level records it fetches the journal's "about"
// page and stores an abstract and a cover image when the record lacks them.
import java.awt.datatransfer.MimeType;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import pl.edu.icm.model.bwmeta.y.YContentEntry;
import pl.edu.icm.model.bwmeta.y.YContentFile;
import pl.edu.icm.model.bwmeta.y.YDescription;
import pl.edu.icm.model.bwmeta.y.YElement;
import pl.edu.icm.model.bwmeta.y.YExportable;
import pl.edu.icm.model.bwmeta.y.YLanguage;
import pl.edu.icm.model.transformers.bwmeta.y.BwmetaTransformerConstants;
import pl.edu.icm.synat.api.services.index.fulltext.FulltextIndexService;
import pl.edu.icm.synat.api.services.index.fulltext.query.FulltextSearchQuery;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.SearchOperator;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.FieldCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.FieldRangeCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.criteria.impl.BooleanCriterion;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.FieldRequest;
import pl.edu.icm.synat.api.services.index.fulltext.query.format.ResultsFormat;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResult;
import pl.edu.icm.synat.api.services.index.fulltext.result.FulltextSearchResults;
import pl.edu.icm.synat.api.services.index.fulltext.result.ResultField;
import pl.edu.icm.synat.api.services.store.PartType;
import pl.edu.icm.synat.api.services.store.StatelessStore;
import pl.edu.icm.synat.api.services.store.model.AbstractRecordPart;
import pl.edu.icm.synat.api.services.store.model.Record;
import pl.edu.icm.synat.api.services.store.model.RecordId;
import pl.edu.icm.synat.api.services.store.model.TextRecordPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.AddOrUpdateRecordTextPart;
import pl.edu.icm.synat.api.services.store.model.batch.operations.BatchOperations;
import pl.edu.icm.synat.application.model.bwmeta.utils.BwmetaConverterUtils;
import pl.edu.icm.synat.console.scripting.utils.ServicesUtils;
import pl.edu.icm.synat.logic.index.publication.CollectionIndexFieldConstants;
import pl.edu.icm.model.transformers.MetadataModel;
import pl.edu.icm.model.transformers.bwmeta.y.BwmetaTransformerConstants;
import pl.edu.icm.synat.api.services.store.StatelessStore;
import pl.edu.icm.synat.api.services.store.StoreClient;
import pl.edu.icm.synat.api.services.store.model.batch.impl.DefaultStoreClient;
import pl.edu.icm.synat.application.model.bwmeta.transformers.SynatMetadataTransformers;
import pl.edu.icm.synat.application.model.bwmeta.utils.BWMetaDeserializer;
import pl.edu.icm.synat.application.model.bwmeta.utils.BWMetaDeserializerImpl;
import pl.edu.icm.synat.process.common.model.impl.util.ModifiedDocumentFactoryImpl;
import pl.edu.icm.synat.process.common.model.impl.util.ModifiedDocumentUtils;
import pl.edu.icm.synat.process.common.writer.BasicWriterSupportImpl;
import pl.edu.icm.synat.process.common.repository.DocumentRepository;
import pl.edu.icm.synat.process.common.repository.ModifiedDocumentRepository;
import pl.edu.icm.synat.process.common.model.api.Document;
import pl.edu.icm.synat.process.common.model.api.NativeDocument;
import pl.edu.icm.synat.process.common.model.api.attachment.Attachment;

// Script entry point. `repository` is assigned without a declaration on
// purpose: that places it in the script binding, where the functions of
// this script read it.
repository = buildRepository();

try {
    findAllWithoutCover();
} catch (Exception failure) {
    // Last-resort handler: report anything that escaped the run.
    failure.printStackTrace();
}
/**
 * Queries the "CollectionIndex" full-text index for records whose "level"
 * field is "bwmeta1.level.hierarchy_Journal_Journal" and runs
 * {@link #workWithDoc(String)} on every hit. A failure on one document is
 * logged and does not stop the loop.
 */
def void findAllWithoutCover() {
    FulltextIndexService index =
            (FulltextIndexService) serviceUtils.getService("CollectionIndex", FulltextIndexService.class);
    ResultsFormat resultsFormat = new ResultsFormat(new FieldRequest("thumbnailPath", true));

    // NOTE(review): the first two arguments are both 0 — confirm the index
    // service does not interpret this as "return zero results".
    FulltextSearchQuery searchQuery = new FulltextSearchQuery(0, 0, resultsFormat,
            new FieldCriterion("level", "bwmeta1.level.hierarchy_Journal_Journal", SearchOperator.AND)
            // ,new FieldRangeCriterion("thumbnailPath", "", "")
    );

    FulltextSearchResults fulltextSearchResults = index.performSearch(searchQuery);
    System.out.println("count=" + fulltextSearchResults.getCount());

    List<FulltextSearchResult> results = fulltextSearchResults.getResults();
    for (FulltextSearchResult result : results) {
        String docId = result.getDocId();
        try {
            workWithDoc(docId);
        } catch (Exception e) {
            log("Error on " + docId + ", " + e.getMessage());
        }
    }
}

/**
 * Fetches the document with the given id from the repository and, when it is
 * a {@code Document} whose metadata is a {@code YElement} that
 * {@link #needsUpdate(YElement)} accepts, passes it to
 * {@link #updateElement(Document, YElement)}.
 *
 * @param docId identifier of the document to process
 */
def void workWithDoc(String docId) {
    NativeDocument nativeDoc = repository.fetchDocument(docId);
    if (!(nativeDoc instanceof Document)) {
        return;
    }
    Document doc = (Document) nativeDoc;

    YExportable yExportable = doc.getMetadata();
    if (!(yExportable instanceof YElement)) {
        return;
    }
    YElement element = (YElement) yExportable;

    if (needsUpdate(element)) {
        updateElement(doc, element);
    }
}

/**
 * @param element metadata element to inspect
 * @return true when the element has no descriptions at all
 */
def boolean needsUpdate(YElement element) {
    return element.getDescriptions().isEmpty();
}

/**
 * Downloads the journal's "about" page, extracts an abstract and a cover URL
 * from it and hands both to
 * {@link #updateDocument(Document, YElement, String, String)}.
 *
 * @param doc     document that owns the metadata
 * @param element metadata element of {@code doc}
 * @return true when the document was modified and stored, false otherwise
 *         (including when no page URL could be built or no content was fetched)
 */
def boolean updateElement(Document doc, YElement element) {
    System.out.println("update " + element.getId());

    // Declared locally so the value does not leak into the script binding.
    String pageUrl = preparePageUrl(element);
    if (pageUrl == null) {
        return false;
    }

    String aboutContent = fetchRemoteContent(pageUrl);
    if (aboutContent == null) {
        return false;
    }

    String abstractValue = parseAbstract(aboutContent);
    String coverUrl = findCoverUrl(aboutContent);
    return updateDocument(doc, element, abstractValue, coverUrl);
}

/**
 * Adds the cover and/or the abstract to the element when they are available
 * and not present yet, then stores the document if anything changed.
 *
 * @param doc           document to store
 * @param element       metadata element to extend
 * @param abstractValue abstract text, may be null or empty
 * @param coverUrl      URL of the cover image, may be null
 * @return true when the metadata was changed and the document stored
 */
def boolean updateDocument(Document doc, YElement element, String abstractValue, String coverUrl) {
    boolean metadataModified = false;

    if (coverUrl != null && !hasCover(element)) {
        addCover(doc, element, coverUrl);
        metadataModified = true;
    }

    if (StringUtils.isNotEmpty(abstractValue) && !hasAbstract(element)) {
        YLanguage lang = detectLang(element, abstractValue);
        YDescription newAbstract = new YDescription(lang, abstractValue, "abstract");
        element.addDescription(newAbstract);
        metadataModified = true;
    }

    if (metadataModified) {
        doc.setMetadata(element);
        repository.storeDocument(doc, null);
    }
    return metadataModified;
}

/**
 * Language detection placeholder.
 *
 * @return always {@code YLanguage.English}
 */
def YLanguage detectLang(YElement element, String abstractValue) {
    //TODO :detect lang
    return YLanguage.English;
}

/**
 * Downloads the cover image, registers it in the element's contents as a
 * "cover" file and attaches the bytes to the document. The MIME type is
 * hard-coded to "image/jpeg".
 *
 * @param doc      document receiving the binary attachment
 * @param element  metadata element receiving the content entry
 * @param coverUrl URL of the image to download
 */
def void addCover(Document doc, YElement element, String coverUrl) {
    byte[] data = fetchFileContent(coverUrl);
    String mimeType = "image/jpeg";

    // The file part of the URL doubles as the attachment id and location.
    String fileId = new URL(coverUrl).getFile();
    List<String> locations = new ArrayList<String>();
    locations.add(fileId);

    YContentFile newCover = new YContentFile(fileId, "cover", mimeType, locations);
    element.getContents().add(newCover);

    Attachment attachment = doc.addBinaryAttachment(fileId, data);
    attachment.addTagByKey("type", "source");
    attachment.addTagByKey("mime", mimeType);
}

/**
 * @return true when any description of the element has type "abstract"
 */
def boolean hasAbstract(YElement element) {
    for (YDescription entry : element.getDescriptions()) {
        if ("abstract".equals(entry.getType())) {
            return true;
        }
    }
    return false;
}

/**
 * @return true when any file entry in the element's contents has type "cover"
 */
def boolean hasCover(YElement element) {
    for (YContentEntry entry : element.getContents()) {
        if (entry.isFile() && "cover".equals(((YContentFile) entry).getType())) {
            return true;
        }
    }
    return false;
}

/** Prints the given text to standard output. */
def void log(String text) {
    System.out.println(text);
}

/**
 * Extracts the abstract from the page HTML. Exactly one
 * {@code div[class=colLeftContentContainer]} must be present; its text is
 * produced by {@link #prepareText(Element)}.
 *
 * @param aboutContent HTML of the "about" page
 * @return the extracted text (possibly empty), or null when the container
 *         div is missing or ambiguous
 */
def String parseAbstract(String aboutContent) {
    org.jsoup.nodes.Document document = Jsoup.parse(aboutContent);
    Elements divs = document.select("div[class=colLeftContentContainer]");

    if (divs.size() != 1) {
        log("No div!");
        return null;
    }

    String result = prepareText(divs.get(0));
    if (StringUtils.isEmpty(result)) {
        log("No abstract");
    }
    return result;
}

/**
 * Concatenates the inner HTML of the div's children whose class attribute is
 * exactly "springerHTML", each followed by a newline, and stops at the first
 * other child that is a {@code p} element.
 */
private String prepareText(Element div) {
    StringBuilder text = new StringBuilder();
    for (Element child : div.children()) {
        if ("springerHTML".equals(child.attr("class"))) {
            text.append(child.html());
            text.append("\n");
        } else if ("p".equals(child.tagName())) {
            break;
        }
    }
    return text.toString();
}

/**
 * Builds the URL of the journal's "about" page from the element's
 * "bwmeta1.id-class.Springer" identifier.
 *
 * @return the page URL, or null (after logging) when the element has no
 *         Springer identifier
 */
def String preparePageUrl(YElement element) {
    String springerId = element.getId("bwmeta1.id-class.Springer");
    if (springerId == null) {
        log("No springerId for " + element.getId());
        // Without an id the URL would contain the text "null"; skip instead.
        return null;
    }
    return "http://www.springer.com/journal/" + springerId + "/about";
}

/**
 * Performs an HTTP GET (up to 3 retries) and returns the response body as text.
 *
 * @param path absolute URL to fetch
 * @return the response body
 * @throws HttpException when the request fails or the status is not 200
 * @throws IOException   when the URL is malformed or the transfer fails
 */
def String fetchRemoteContent(String path) throws HttpException, IOException {
    GetMethod method = null;
    try {
        // Validates the URL up front; a malformed one fails here as an IOException.
        new URL(path);

        HttpClient httpClient = new HttpClient();
        method = new GetMethod(path);
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler(3, false));

        // Execute the method.
        int statusCode = httpClient.executeMethod(method);
        if (statusCode != HttpStatus.SC_OK) {
            throw new HttpException("Method failed: " + method.getStatusLine());
        }

        // Read the response body.
        // NOTE(review): this overload decodes with the platform default
        // charset — confirm that matches the pages being fetched.
        return org.apache.commons.io.IOUtils.toString(method.getResponseBodyAsStream());
    } catch (HttpException e) {
        log("http error on " + path);
        throw e;
    } finally {
        if (method != null) {
            method.releaseConnection();
        }
    }
}

/**
 * Performs an HTTP GET (up to 3 retries) and returns the response body as bytes.
 * The connection is always released, also when the request fails.
 *
 * @param imageUrl absolute URL of the file to download
 * @return the raw response body
 */
private byte[] fetchFileContent(String imageUrl) {
    HttpClient httpClient = new HttpClient();
    GetMethod method = new GetMethod(imageUrl);
    try {
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler(3, false));

        // Execute the method.
        int statusCode = httpClient.executeMethod(method);
        if (statusCode != HttpStatus.SC_OK) {
            throw new HttpException("Method failed: " + method.getStatusLine());
        }
        return org.apache.commons.io.IOUtils.toByteArray(method.getResponseBodyAsStream());
    } finally {
        method.releaseConnection();
    }
}

/**
 * Locates the cover image URL in the page HTML: takes the fragment returned
 * by {@code substringBetween} and reads the {@code src} attribute of the
 * first {@code <img>} tag in it. A value not starting with "http" is
 * prefixed with "http:".
 *
 * @param aboutContent HTML of the "about" page
 * @return the image URL, or null when no fragment or no image tag is found
 */
def String findCoverUrl(String aboutContent) {
    // NOTE(review): both delimiters are a bare newline; they look like markup
    // that was lost from this file — confirm the intended opening/closing tags.
    String imgTag = StringUtils.substringBetween(aboutContent, "\n", "\n");
    String imageUrl = null;

    // substringBetween yields null when the delimiters are absent.
    if (imgTag != null) {
        String imageUrlRegex = "<img[^>]+src\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>";
        Pattern imageUrlPattern = Pattern.compile(imageUrlRegex);
        Matcher m = imageUrlPattern.matcher(imgTag);
        if (m.find()) {
            imageUrl = m.group(1);
            if (!imageUrl.startsWith("http")) {
                imageUrl = "http:" + imageUrl;
            }
        }
    }

    log("cover: " + imageUrl);
    return imageUrl;
}

/**
 * Wires a {@code ModifiedDocumentRepository} on top of the "Store" service:
 * a BWMeta deserializer and document factory for reading, and a writer
 * configured for {@code BWMETA_2_1} / the Y model for writing.
 *
 * @return the configured repository
 */
def DocumentRepository buildRepository() {
    StatelessStore statelessStore =
            (StatelessStore) serviceUtils.getService("Store", StatelessStore.class);

    ModifiedDocumentUtils modifiedDocumentUtils = new ModifiedDocumentUtils();
    BWMetaDeserializer bwmetaDeserializer = new BWMetaDeserializerImpl();
    modifiedDocumentUtils.setBwmetaDeserializer(bwmetaDeserializer);

    ModifiedDocumentFactoryImpl documentFactory = new ModifiedDocumentFactoryImpl();
    documentFactory.setModifiedDocumentUtils(modifiedDocumentUtils);

    BasicWriterSupportImpl writerSupport = new BasicWriterSupportImpl();
    writerSupport.setTargetFormat(BwmetaTransformerConstants.BWMETA_2_1);
    writerSupport.setTargetModel((MetadataModel) BwmetaTransformerConstants.Y);
    writerSupport.setTransformerFactory(SynatMetadataTransformers.BTF);

    StoreClient storeClient = new DefaultStoreClient(statelessStore);

    ModifiedDocumentRepository documentRepository = new ModifiedDocumentRepository();
    documentRepository.setDocumentFactory(documentFactory);
    documentRepository.setWriterSupport(writerSupport);
    documentRepository.setStoreClient(storeClient);
    return documentRepository;
}