我有几个Word文档,每个文档都包含数百页的科学数据,其中包括:
- 化学式(H2SO4,所有适当的下标和上标)
- 科学数字(使用上标格式化的指数)
- 很多数学方程式。使用Word中的数学方程式编辑器编写。
问题是,将这些数据存储在Word中对我们而言效率不高。因此,我们希望将所有这些信息存储在数据库(MySQL)中,并把格式转换为LaTeX。
有什么方法可以使用VBA遍历Word文档中的所有下标、上标和公式吗?
我有几个Word文档,每个文档都包含数百页的科学数据,其中包括:
问题是,将这些数据存储在Word中对我们而言效率不高。因此,我们希望将所有这些信息存储在数据库(MySQL)中,并把格式转换为LaTeX。
有什么方法可以使用VBA遍历Word文档中的所有下标、上标和公式吗?
Answers:
有的。我建议使用Powershell,因为它可以很好地处理Word文件。我认为这将是最简单的方法。
有关Powershell与Word自动化的更多信息,请访问:http://www.simple-talk.com/dotnet/.net-tools/com-automation-of-office-applications-via-powershell/
我进行了更深入的研究,发现了以下PowerShell脚本:
# Converts every Word document found in a folder to filtered HTML by
# driving Word through COM automation.
#   -docpath  : folder that is searched for "*.doc" files
#   -htmlpath : folder that receives the .html files (defaults to -docpath)
param([string]$docpath,[string]$htmlpath = $docpath)

$srcfiles = Get-ChildItem $docPath -filter "*.doc"
$saveFormat = [Enum]::Parse([Microsoft.Office.Interop.Word.WdSaveFormat], "wdFormatFilteredHTML");
$word = new-object -comobject word.application
$word.Visible = $False

# Opens the document held in $doc (set by the calling ForEach loop) and
# saves it as "<htmlpath>\<document base name>.html".
function saveas-filteredhtml
{
    # Build the target path explicitly: inside a double-quoted string
    # "$doc.fullname" only expands $doc and appends the literal text
    # ".fullname", so the original file name was never what was intended.
    $target = Join-Path $htmlpath ($doc.BaseName + ".html")
    $opendoc = $word.documents.open($doc.FullName);
    $opendoc.saveas([ref]$target, [ref]$saveFormat);
    $opendoc.close();
}

try
{
    ForEach ($doc in $srcfiles)
    {
        Write-Host "Processing :" $doc.FullName
        saveas-filteredhtml
    }
}
finally
{
    # Always shut Word down, even when a conversion fails, so that no
    # hidden Word process is left running.
    $word.quit();
}
将其另存为 convertdoc-tohtml.ps1,并用以下命令运行:
convertdoc-tohtml.ps1 -docpath "C:\Documents" -htmlpath "C:\Output"
它将来自指定目录的所有.doc文件另存为html文件。所以我有一个文档文件,其中有带下标的H2SO4,在powershell转换后,输出如下:
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
<meta name=Generator content="Microsoft Word 14 (filtered)">
<style>
<!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin-top:0in;
margin-right:0in;
margin-bottom:10.0pt;
margin-left:0in;
line-height:115%;
font-size:11.0pt;
font-family:"Calibri","sans-serif";}
.MsoChpDefault
{font-family:"Calibri","sans-serif";}
.MsoPapDefault
{margin-bottom:10.0pt;
line-height:115%;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
-->
</style>
</head>
<body lang=EN-US>
<div class=WordSection1>
<p class=MsoNormal><span lang=PL>H<sub>2</sub>SO<sub>4</sub></span></p>
</div>
</body>
</html>
如您所见,下标在HTML中有自己的标记,因此剩下要做的就是用bash或C++解析该文件:截取从<body>到</body>之间的内容,将<sub>标记转换为LaTeX下标,然后删除其余的HTML标记。
来自http://blogs.technet.com/b/bshukla/archive/2011/09/27/3347395.aspx的代码
因此,我开发了C ++解析器来查找HTML下标并将其替换为LATEX下标。
代码:
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
using namespace std;
/*
    Reads a text file and splits every line into words.

    Words are separated by spaces and tabs; runs of separators are
    collapsed and lines that contain no word at all are skipped.

    REQUIRED INCLUDES:
        #include <iostream>
        #include <fstream>
        #include <string>
        #include <vector>

    EXPECTS: vec      - two-dimensional vector; the parsed lines are
                        appended after its existing content
             filename - path of the file to read (any length)
    RETURNS: the two-dimensional vector, indexed as vec[lines][words].
             If the file cannot be opened a message is written to
             std::cout and vec is returned unchanged.
*/
std::vector< std::vector<std::string> > parse( std::vector< std::vector<std::string> > vec, std::string filename )
{
    static const char separators[] = " \t";

    // c_str() replaces the former hand-copied variable-length char array,
    // which is not standard C++ and imposed an arbitrary 126 character limit.
    std::ifstream vfile( filename.c_str() );
    if ( !vfile.is_open() )
    {
        std::cout << "Unable to open file " << filename << ".\n";
        return vec;
    }

    std::string line;
    // Testing the result of getline itself stops exactly at end of input.
    while ( std::getline( vfile, line ) )
    {
        std::vector<std::string> words;
        std::string::size_type begin = line.find_first_not_of( separators );
        while ( begin != std::string::npos )
        {
            std::string::size_type end = line.find_first_of( separators, begin );
            if ( end == std::string::npos )
                end = line.size();
            words.push_back( line.substr( begin, end - begin ) );
            begin = line.find_first_not_of( separators, end );
        }
        if ( !words.empty() )
            vec.push_back( words );
    }
    return vec;
}
// Writes one whitespace-free HTML token to `out`, replacing every
// "<sub>text</sub>" with the LaTeX form "_{text}" followed by a newline.
// All other characters are copied unchanged.
static void print_with_latex_subscripts( const std::string& word, std::ostream& out )
{
    static const std::string open_tag = "<sub>";
    // Number of characters skipped after the subscript text; this is the
    // length of "</sub>". The characters themselves are not inspected.
    const std::string::size_type close_len = 6;

    std::string::size_type k = 0;
    while ( k < word.size() )
    {
        if ( word.compare( k, open_tag.size(), open_tag ) == 0 )
        {
            const std::string::size_type text_begin = k + open_tag.size();
            const std::string::size_type text_end = word.find( '<', text_begin );
            // Convert only when the subscript is terminated inside this
            // token. An unterminated "<sub>" (the text contained a space,
            // so the closing tag is in a later token) used to be scanned
            // past the end of the string; it is now copied through verbatim.
            if ( text_end != std::string::npos )
            {
                out << "_{" << word.substr( text_begin, text_end - text_begin ) << "}" << '\n';
                k = text_end + close_len;
                continue;
            }
        }
        out << word[k];
        ++k;
    }
}

// Parses "parse.html" and prints the tokens from the "<body" token up to
// (not including) "</body>", one token per line, with HTML subscripts
// converted to LaTeX subscripts.
int main()
{
    std::vector< std::vector<std::string> > vec;
    vec = parse( vec, "parse.html" );

    bool body = false;
    for ( std::vector< std::vector<std::string> >::size_type i = 0; i < vec.size(); i++ )
    {
        for ( std::vector<std::string>::size_type j = 0; j < vec[i].size(); j++ )
        {
            const std::string& word = vec[i][j];
            // Only the exact tokens "<body" and "</body>" toggle the flag,
            // so a bare "<body>" tag without attributes is not recognised.
            if ( word == "<body" ) body = true;
            if ( word == "</body>" ) body = false;
            if ( body )
            {
                print_with_latex_subscripts( word, std::cout );
                std::cout << '\n';
            }
        }
    }
    std::cout.flush();
    return 0;
}
对于html文件:
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
<meta name=Generator content="Microsoft Word 14 (filtered)">
<style>
<!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin-top:0in;
margin-right:0in;
margin-bottom:10.0pt;
margin-left:0in;
line-height:115%;
font-size:11.0pt;
font-family:"Calibri","sans-serif";}
.MsoChpDefault
{font-family:"Calibri","sans-serif";}
.MsoPapDefault
{margin-bottom:10.0pt;
line-height:115%;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
-->
</style>
</head>
<body lang=EN-US>
<div class=WordSection1>
<p class=MsoNormal><span lang=PL>H<sub>2</sub>SO<sub>4</sub></span></p>
</div>
</body>
</html>
输出为:
<body
lang=EN-US>
<div
class=WordSection1>
<p
class=MsoNormal><span
lang=PL>H_{2}
SO_{4}
</span></p>
</div>
当然这不是理想的,但是将其视为概念的证明。
您可以直接从任何2007+版本的Office文档中提取XML,步骤如下:
- 将文件扩展名从 .docx 改为 .zip;
- 用7zip(或其他解压程序)解压该文件;
- 在解压出的文件夹中打开 word 子文件夹下的 document.xml 文件。
该文件应包含文档的全部内容。我创建了一个示例文档,并在body标签中找到了以下内容(请注意,这是我很快拼凑出来的,因此格式可能略有偏差):
<?xml version="1.0" encoding="UTF-8" standalone="true"?>
<w:body>
-<w:p w:rsidRDefault="000E0C3A" w:rsidR="008B5DAA">
-<w:r>
<w:t xml:space="preserve">This </w:t>
</w:r>
- <w:r w:rsidRPr="000E0C3A">
-<w:rPr>
<w:vertAlign w:val="superscript"/>
</w:rPr>
<w:t>is</w:t>
</w:r>
- <w:r>
<w:t xml:space="preserve"> a </w:t>
</w:r>
-<w:r w:rsidRPr="000E0C3A">
-<w:rPr>
<w:vertAlign w:val="subscript"/>
</w:rPr>
<w:t>test</w:t>
</w:r>
-<w:r>
<w:t>.</w:t>
</w:r>
</w:p>
</w:body>
看来<w:t>标记用于文本,<w:rPr>是字体格式的定义,<w:p>表示一个新的段落。
在Word中对应的显示效果如下:
我一直在寻找与mnmnc追求的方法不同的方法。
我尝试将测试用的Word文档另存为HTML,但没有成功。过去我就发现Office生成的HTML充斥着大量冗余内容,几乎不可能从中挑出想要的部分,这次的情况也是如此。方程式也有问题:Word会将方程式另存为图像,每个方程式对应两张图像,一张扩展名为WMZ,另一张为GIF。如果用Google Chrome显示该HTML文件,方程式看起来还可以,但并不完美;其外观与用支持透明图像的图像查看/编辑工具打开GIF文件时一致。如果用Internet Explorer显示该HTML文件,方程式则看起来很完美。
附加信息
我应该在原始答案中包含此信息。
我创建了一个小的Word文档,并另存为HTML。下图中的三个面板显示原始Word文档,Microsoft Internet Explorer显示的Html文档和Google Chrome显示的Html文档。
如前所述,IE和Chrome图像之间的差异是方程保存两次的结果,一次是WMZ格式,一次是GIF格式。HTML太大,无法在此处显示。
宏创建的HTML为:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head><body>
<p>Some ordinary text.</p>
<p>H<sub>2</sub>SO<sub>4</sub>.</p>
<p>Abc &amp; def &gt; ghi &lt; jkl</p>
<p>x<sup>3</sup>+ x<sup>2</sup>+3x+4=0.</p><p></p>
<p><i>Equation</i> </p>
<p>Mno</p>
<p><i>Equation</i></p>
</body></html>
显示为:
我没有尝试转换方程式,因为免费的MathType软件开发工具包显然已包含转换为LaTeX的例程。
该代码非常基础,因此注释不多。如有不清楚的地方请提问。注意:这是原始代码的改进版本。
Sub ConvertToHtml()

  ' Writes the text of every story of the active document to the file
  ' "TestWord.html" in the document's own folder as minimal XHTML:
  '   * each paragraph becomes <p>...</p> (see CheckPara)
  '   * superscript/subscript characters are wrapped in <sup>/<sub>
  '   * "&", "<" and ">" are written as HTML entities
  '   * the placeholder character Chr(1) is written as <i>Equation</i>

  Dim ChrText As String
  Dim FileNum As Long
  Dim IsSub As Boolean
  Dim IsSuper As Boolean
  Dim NumPendingCR As Long
  Dim objChr As Object
  Dim PathCrnt As String
  Dim rng As Word.Range
  Dim WithinPara As Boolean
  Dim WithinSuper As Boolean
  Dim WithinSub As Boolean

  FileNum = FreeFile
  PathCrnt = ActiveDocument.Path
  Open PathCrnt & "\TestWord.html" For Output Access Write Lock Write As #FileNum
  Print #FileNum, "<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Frameset//EN""" & _
                  " ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"">" & _
                  vbCr & vbLf & "<html xmlns=""http://www.w3.org/1999/xhtml"" " & _
                  "xml:lang=""en"" lang=""en"">" & vbCr & vbLf & _
                  "<head><meta http-equiv=""Content-Type"" content=""text/html; " _
                  & "charset=utf-8"" />" & vbCr & vbLf & "</head><body>"

  For Each rng In ActiveDocument.StoryRanges

    NumPendingCR = 0
    WithinPara = False
    WithinSub = False
    WithinSuper = False

    Do While Not (rng Is Nothing)
      For Each objChr In rng.Characters
        ChrText = objChr.Text
        IsSuper = CBool(objChr.Font.Superscript)
        IsSub = CBool(objChr.Font.Subscript)
        If ChrText = vbCr Then
          ' A paragraph mark ends any open run so that <sup>/<sub>
          ' never span a paragraph boundary.
          IsSuper = False
          IsSub = False
        End If

        ' Closing tags go out before any paragraph markup ...
        If WithinSuper And Not IsSuper Then
          Print #FileNum, "</sup>";
          WithinSuper = False
        End If
        If WithinSub And Not IsSub Then
          Print #FileNum, "</sub>";
          WithinSub = False
        End If

        If ChrText = vbCr Then
          NumPendingCR = NumPendingCR + 1
        Else
          ' ... and opening tags after it, so the output nests as
          ' <p><sup>x</sup></p> rather than <sup><p>x.
          Print #FileNum, CheckPara(NumPendingCR, WithinPara);
          If IsSuper And Not WithinSuper Then
            Print #FileNum, "<sup>";
            WithinSuper = True
          End If
          If IsSub And Not WithinSub Then
            Print #FileNum, "<sub>";
            WithinSub = True
          End If
          Select Case ChrText
            Case "&"
              Print #FileNum, "&amp;";
            Case "<"
              Print #FileNum, "&lt;";
            Case ">"
              Print #FileNum, "&gt;";
            Case Chr(1)
              Print #FileNum, "<i>Equation</i>";
            Case Else
              Print #FileNum, ChrText;
          End Select
        End If
      Next
      Set rng = rng.NextStoryRange
    Loop

    ' Close whatever is still open before the next story resets the flags.
    If WithinSuper Then
      Print #FileNum, "</sup>";
      WithinSuper = False
    End If
    If WithinSub Then
      Print #FileNum, "</sub>";
      WithinSub = False
    End If
    If WithinPara Then
      Print #FileNum, "</p>";
      WithinPara = False
    End If

  Next

  Print #FileNum, vbCr & vbLf & "</body></html>"

  Close #FileNum

End Sub
Function CheckPara(ByRef NumPendingCR As Long, _
                   ByRef WithinPara As Boolean) As String

  ' Called when a character is about to be written. Returns the paragraph
  ' markup that must precede it and updates both arguments:
  '   * no pending CR : "<p>" if no paragraph is open, otherwise ""
  '   * pending CRs   : "</p>" if a paragraph is open (uses one CR),
  '                     one "<p></p>" per remaining pair of CRs,
  '                     then a line break and "<p>"
  ' On return WithinPara is True and NumPendingCR is 0.

  Dim PairCount As Long
  Dim PairIdx As Long
  Dim Prefix As String

  If NumPendingCR <> 0 Then
    Prefix = ""
    If WithinPara And (NumPendingCR > 0) Then
      ' The first CR terminates the open paragraph
      Prefix = "</p>"
      NumPendingCR = NumPendingCR - 1
    End If
    ' Every remaining pair of CRs yields one empty paragraph
    PairCount = NumPendingCR \ 2
    For PairIdx = 1 To PairCount
      Prefix = Prefix & "<p></p>"
    Next PairIdx
    NumPendingCR = 0
    WithinPara = True
    CheckPara = Prefix & vbCr & vbLf & "<p>"
  ElseIf WithinPara Then
    CheckPara = ""
  Else
    WithinPara = True
    CheckPara = "<p>"
  End If

End Function
最简单的方法是在VBA中执行以下几行:
Sub testing()
  ' Runs one forward Find over the active document's content that
  ' matches on formatting: text whose font is superscript.
  Dim Finder As Word.Find

  Set Finder = ActiveDocument.Content.Find
  Finder.ClearFormatting
  Finder.Format = True
  Finder.Font.Superscript = True
  Finder.Execute Forward:=True
End Sub
上面的代码会在文档中查找上标格式的文本(单次 Execute 只定位第一处匹配,要处理全部匹配需要循环调用)。如果要对找到的文本进行处理,只需把相应操作加入该过程即可。例如,要找到上标格式的单词“super”并将其替换为“super found”,请使用:
Sub testing()
  ' Replaces every occurrence of "super" that is formatted as
  ' superscript in the active document with "super found".
  Dim Finder As Word.Find

  Set Finder = ActiveDocument.Content.Find
  Finder.ClearFormatting
  Finder.Format = True
  Finder.Font.Superscript = True
  Finder.Execute FindText:="super", _
                 ReplaceWith:="super found", _
                 Replace:=wdReplaceAll, _
                 Forward:=True
End Sub